Docs: Improved tokenizer docs (#18356)
* Docs: Improved tokenizer docs: added descriptions and runnable examples
* Addressed Nik's comments
* Added TESTRESPONSEs for all tokenizer examples
* Added TESTRESPONSEs for all analyzer examples too
* Added docs, examples, and TESTRESPONSEs for character filters
* Skipping two tests: one interprets "$1" as a stack variable - same problem exists with the REST tests; the other because the "took" value is always different
* Fixed tests with "took"
* Fixed failing tests and removed preserve_original from fingerprint analyzer
parent dc33a83231, commit 5da9e5dcbc
|
@ -64,3 +64,38 @@ POST my_index/_analyze
|
|||
English stop words will be removed. The resulting terms are:
|
||||
`[ old, brown, cow ]`
|
||||
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "old",
|
||||
"start_offset": 4,
|
||||
"end_offset": 7,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "brown",
|
||||
"start_offset": 8,
|
||||
"end_offset": 13,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "cow",
|
||||
"start_offset": 14,
|
||||
"end_offset": 17,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 3
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
|
|
@ -84,6 +84,48 @@ POST my_index/_analyze
|
|||
--------------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "is",
|
||||
"start_offset": 0,
|
||||
"end_offset": 2,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "this",
|
||||
"start_offset": 3,
|
||||
"end_offset": 7,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "deja",
|
||||
"start_offset": 11,
|
||||
"end_offset": 15,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "vu",
|
||||
"start_offset": 16,
|
||||
"end_offset": 22,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 3
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above example produces the following terms:
|
||||
|
||||
[source,text]
|
||||
|
@ -119,13 +161,10 @@ PUT my_index
|
|||
"analyzer": {
|
||||
"my_custom_analyzer": {
|
||||
"type": "custom",
|
||||
|
||||
"char_filter": [
|
||||
"emoticons" <1>
|
||||
],
|
||||
|
||||
"tokenizer": "punctuation", <1>
|
||||
|
||||
"filter": [
|
||||
"lowercase",
|
||||
"english_stop" <1>
|
||||
|
@ -165,11 +204,54 @@ POST my_index/_analyze
|
|||
"text": "I'm a :) person, and you?"
|
||||
}
|
||||
--------------------------------------------------
|
||||
// CONSOLE
|
||||
|
||||
<1> The `emoticons` character filter, `punctuation` tokenizer and
|
||||
`english_stop` token filter are custom implementations which are defined
|
||||
in the same index settings.
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "i'm",
|
||||
"start_offset": 0,
|
||||
"end_offset": 3,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "_happy_",
|
||||
"start_offset": 6,
|
||||
"end_offset": 8,
|
||||
"type": "word",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "person",
|
||||
"start_offset": 9,
|
||||
"end_offset": 15,
|
||||
"type": "word",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "you",
|
||||
"start_offset": 21,
|
||||
"end_offset": 24,
|
||||
"type": "word",
|
||||
"position": 5
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above example produces the following terms:
|
||||
|
||||
[source,text]
|
||||
|
|
|
@ -36,6 +36,27 @@ POST _analyze
|
|||
---------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "and consistent godel is said sentence this yes",
|
||||
"start_offset": 0,
|
||||
"end_offset": 52,
|
||||
"type": "fingerprint",
|
||||
"position": 0
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above sentence would produce the following single term:
|
||||
|
||||
[source,text]
|
||||
|
@ -58,16 +79,11 @@ The `fingerprint` analyzer accepts the following parameters:
|
|||
The maximum token size to emit. Defaults to `255`. Tokens larger than
|
||||
this size will be discarded.
|
||||
|
||||
`preserve_original`::
|
||||
|
||||
If `true`, emits two tokens: one with ASCII-folding of terms that contain
|
||||
extended characters (if any) and one with the original characters.
|
||||
Defaults to `false`.
|
||||
|
||||
`stopwords`::
|
||||
|
||||
A pre-defined stop words list like `_english_` or an array containing a
|
||||
list of stop words. Defaults to `_none_`.
|
||||
|
||||
`stopwords_path`::
|
||||
|
||||
The path to a file containing stop words.
|
||||
|
@ -80,8 +96,7 @@ about stop word configuration.
|
|||
=== Example configuration
|
||||
|
||||
In this example, we configure the `fingerprint` analyzer to use the
|
||||
pre-defined list of English stop words, and to emit a second token in
|
||||
the presence of non-ASCII characters:
|
||||
pre-defined list of English stop words:
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
|
@ -92,8 +107,7 @@ PUT my_index
|
|||
"analyzer": {
|
||||
"my_fingerprint_analyzer": {
|
||||
"type": "fingerprint",
|
||||
"stopwords": "_english_",
|
||||
"preserve_original": true
|
||||
"stopwords": "_english_"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -110,9 +124,30 @@ POST my_index/_analyze
|
|||
----------------------------
|
||||
// CONSOLE
|
||||
|
||||
The above example produces the following two terms:
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "consistent godel said sentence yes",
|
||||
"start_offset": 0,
|
||||
"end_offset": 52,
|
||||
"type": "fingerprint",
|
||||
"position": 0
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above example produces the following term:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ consistent godel said sentence yes, consistent gödel said sentence yes ]
|
||||
[ consistent godel said sentence yes ]
|
||||
---------------------------
|
||||
|
|
|
@ -25,6 +25,27 @@ POST _analyze
|
|||
---------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.",
|
||||
"start_offset": 0,
|
||||
"end_offset": 56,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above sentence would produce the following single term:
|
||||
|
||||
[source,text]
|
||||
|
|
|
@ -30,6 +30,104 @@ POST _analyze
|
|||
---------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "the",
|
||||
"start_offset": 0,
|
||||
"end_offset": 3,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "2",
|
||||
"start_offset": 4,
|
||||
"end_offset": 5,
|
||||
"type": "word",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "quick",
|
||||
"start_offset": 6,
|
||||
"end_offset": 11,
|
||||
"type": "word",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "brown",
|
||||
"start_offset": 12,
|
||||
"end_offset": 17,
|
||||
"type": "word",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "foxes",
|
||||
"start_offset": 18,
|
||||
"end_offset": 23,
|
||||
"type": "word",
|
||||
"position": 4
|
||||
},
|
||||
{
|
||||
"token": "jumped",
|
||||
"start_offset": 24,
|
||||
"end_offset": 30,
|
||||
"type": "word",
|
||||
"position": 5
|
||||
},
|
||||
{
|
||||
"token": "over",
|
||||
"start_offset": 31,
|
||||
"end_offset": 35,
|
||||
"type": "word",
|
||||
"position": 6
|
||||
},
|
||||
{
|
||||
"token": "the",
|
||||
"start_offset": 36,
|
||||
"end_offset": 39,
|
||||
"type": "word",
|
||||
"position": 7
|
||||
},
|
||||
{
|
||||
"token": "lazy",
|
||||
"start_offset": 40,
|
||||
"end_offset": 44,
|
||||
"type": "word",
|
||||
"position": 8
|
||||
},
|
||||
{
|
||||
"token": "dog",
|
||||
"start_offset": 45,
|
||||
"end_offset": 48,
|
||||
"type": "word",
|
||||
"position": 9
|
||||
},
|
||||
{
|
||||
"token": "s",
|
||||
"start_offset": 49,
|
||||
"end_offset": 50,
|
||||
"type": "word",
|
||||
"position": 10
|
||||
},
|
||||
{
|
||||
"token": "bone",
|
||||
"start_offset": 51,
|
||||
"end_offset": 55,
|
||||
"type": "word",
|
||||
"position": 11
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above sentence would produce the following terms:
|
||||
|
||||
[source,text]
|
||||
|
@ -110,6 +208,55 @@ POST my_index/_analyze
|
|||
<1> The backslashes in the pattern need to be escaped when specifying the
|
||||
pattern as a JSON string.
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "john",
|
||||
"start_offset": 0,
|
||||
"end_offset": 4,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "smith",
|
||||
"start_offset": 5,
|
||||
"end_offset": 10,
|
||||
"type": "word",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "foo",
|
||||
"start_offset": 11,
|
||||
"end_offset": 14,
|
||||
"type": "word",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "bar",
|
||||
"start_offset": 15,
|
||||
"end_offset": 18,
|
||||
"type": "word",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "com",
|
||||
"start_offset": 19,
|
||||
"end_offset": 22,
|
||||
"type": "word",
|
||||
"position": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above example produces the following terms:
|
||||
|
||||
[source,text]
|
||||
|
@ -148,6 +295,62 @@ GET my_index/_analyze
|
|||
--------------------------------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "moose",
|
||||
"start_offset": 0,
|
||||
"end_offset": 5,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "x",
|
||||
"start_offset": 5,
|
||||
"end_offset": 6,
|
||||
"type": "word",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "ftp",
|
||||
"start_offset": 8,
|
||||
"end_offset": 11,
|
||||
"type": "word",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "class",
|
||||
"start_offset": 11,
|
||||
"end_offset": 16,
|
||||
"type": "word",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "2",
|
||||
"start_offset": 16,
|
||||
"end_offset": 17,
|
||||
"type": "word",
|
||||
"position": 4
|
||||
},
|
||||
{
|
||||
"token": "beta",
|
||||
"start_offset": 18,
|
||||
"end_offset": 22,
|
||||
"type": "word",
|
||||
"position": 5
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above example produces the following terms:
|
||||
|
||||
[source,text]
|
||||
|
|
|
@ -25,6 +25,97 @@ POST _analyze
|
|||
---------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "the",
|
||||
"start_offset": 0,
|
||||
"end_offset": 3,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "quick",
|
||||
"start_offset": 6,
|
||||
"end_offset": 11,
|
||||
"type": "word",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "brown",
|
||||
"start_offset": 12,
|
||||
"end_offset": 17,
|
||||
"type": "word",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "foxes",
|
||||
"start_offset": 18,
|
||||
"end_offset": 23,
|
||||
"type": "word",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "jumped",
|
||||
"start_offset": 24,
|
||||
"end_offset": 30,
|
||||
"type": "word",
|
||||
"position": 4
|
||||
},
|
||||
{
|
||||
"token": "over",
|
||||
"start_offset": 31,
|
||||
"end_offset": 35,
|
||||
"type": "word",
|
||||
"position": 5
|
||||
},
|
||||
{
|
||||
"token": "the",
|
||||
"start_offset": 36,
|
||||
"end_offset": 39,
|
||||
"type": "word",
|
||||
"position": 6
|
||||
},
|
||||
{
|
||||
"token": "lazy",
|
||||
"start_offset": 40,
|
||||
"end_offset": 44,
|
||||
"type": "word",
|
||||
"position": 7
|
||||
},
|
||||
{
|
||||
"token": "dog",
|
||||
"start_offset": 45,
|
||||
"end_offset": 48,
|
||||
"type": "word",
|
||||
"position": 8
|
||||
},
|
||||
{
|
||||
"token": "s",
|
||||
"start_offset": 49,
|
||||
"end_offset": 50,
|
||||
"type": "word",
|
||||
"position": 9
|
||||
},
|
||||
{
|
||||
"token": "bone",
|
||||
"start_offset": 51,
|
||||
"end_offset": 55,
|
||||
"type": "word",
|
||||
"position": 10
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above sentence would produce the following terms:
|
||||
|
||||
[source,text]
|
||||
|
|
|
@ -33,6 +33,97 @@ POST _analyze
|
|||
---------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "the",
|
||||
"start_offset": 0,
|
||||
"end_offset": 3,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "2",
|
||||
"start_offset": 4,
|
||||
"end_offset": 5,
|
||||
"type": "<NUM>",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "quick",
|
||||
"start_offset": 6,
|
||||
"end_offset": 11,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "brown",
|
||||
"start_offset": 12,
|
||||
"end_offset": 17,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "foxes",
|
||||
"start_offset": 18,
|
||||
"end_offset": 23,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 4
|
||||
},
|
||||
{
|
||||
"token": "jumped",
|
||||
"start_offset": 24,
|
||||
"end_offset": 30,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 5
|
||||
},
|
||||
{
|
||||
"token": "over",
|
||||
"start_offset": 31,
|
||||
"end_offset": 35,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 6
|
||||
},
|
||||
{
|
||||
"token": "the",
|
||||
"start_offset": 36,
|
||||
"end_offset": 39,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 7
|
||||
},
|
||||
{
|
||||
"token": "lazy",
|
||||
"start_offset": 40,
|
||||
"end_offset": 44,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 8
|
||||
},
|
||||
{
|
||||
"token": "dog's",
|
||||
"start_offset": 45,
|
||||
"end_offset": 50,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 9
|
||||
},
|
||||
{
|
||||
"token": "bone",
|
||||
"start_offset": 51,
|
||||
"end_offset": 55,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 10
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above sentence would produce the following terms:
|
||||
|
||||
[source,text]
|
||||
|
@ -98,6 +189,89 @@ POST my_index/_analyze
|
|||
----------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "2",
|
||||
"start_offset": 4,
|
||||
"end_offset": 5,
|
||||
"type": "<NUM>",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "quick",
|
||||
"start_offset": 6,
|
||||
"end_offset": 11,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "brown",
|
||||
"start_offset": 12,
|
||||
"end_offset": 17,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "foxes",
|
||||
"start_offset": 18,
|
||||
"end_offset": 23,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 4
|
||||
},
|
||||
{
|
||||
"token": "jumpe",
|
||||
"start_offset": 24,
|
||||
"end_offset": 29,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 5
|
||||
},
|
||||
{
|
||||
"token": "d",
|
||||
"start_offset": 29,
|
||||
"end_offset": 30,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 6
|
||||
},
|
||||
{
|
||||
"token": "over",
|
||||
"start_offset": 31,
|
||||
"end_offset": 35,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 7
|
||||
},
|
||||
{
|
||||
"token": "lazy",
|
||||
"start_offset": 40,
|
||||
"end_offset": 44,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 9
|
||||
},
|
||||
{
|
||||
"token": "dog's",
|
||||
"start_offset": 45,
|
||||
"end_offset": 50,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 10
|
||||
},
|
||||
{
|
||||
"token": "bone",
|
||||
"start_offset": 51,
|
||||
"end_offset": 55,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 11
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
The above example produces the following terms:
|
||||
|
||||
[source,text]
|
||||
|
|
|
@ -29,6 +29,83 @@ POST _analyze
|
|||
---------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "quick",
|
||||
"start_offset": 6,
|
||||
"end_offset": 11,
|
||||
"type": "word",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "brown",
|
||||
"start_offset": 12,
|
||||
"end_offset": 17,
|
||||
"type": "word",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "foxes",
|
||||
"start_offset": 18,
|
||||
"end_offset": 23,
|
||||
"type": "word",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "jumped",
|
||||
"start_offset": 24,
|
||||
"end_offset": 30,
|
||||
"type": "word",
|
||||
"position": 4
|
||||
},
|
||||
{
|
||||
"token": "over",
|
||||
"start_offset": 31,
|
||||
"end_offset": 35,
|
||||
"type": "word",
|
||||
"position": 5
|
||||
},
|
||||
{
|
||||
"token": "lazy",
|
||||
"start_offset": 40,
|
||||
"end_offset": 44,
|
||||
"type": "word",
|
||||
"position": 7
|
||||
},
|
||||
{
|
||||
"token": "dog",
|
||||
"start_offset": 45,
|
||||
"end_offset": 48,
|
||||
"type": "word",
|
||||
"position": 8
|
||||
},
|
||||
{
|
||||
"token": "s",
|
||||
"start_offset": 49,
|
||||
"end_offset": 50,
|
||||
"type": "word",
|
||||
"position": 9
|
||||
},
|
||||
{
|
||||
"token": "bone",
|
||||
"start_offset": 51,
|
||||
"end_offset": 55,
|
||||
"type": "word",
|
||||
"position": 10
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above sentence would produce the following terms:
|
||||
|
||||
[source,text]
|
||||
|
@ -87,6 +164,76 @@ POST my_index/_analyze
|
|||
----------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "quick",
|
||||
"start_offset": 6,
|
||||
"end_offset": 11,
|
||||
"type": "word",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "brown",
|
||||
"start_offset": 12,
|
||||
"end_offset": 17,
|
||||
"type": "word",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "foxes",
|
||||
"start_offset": 18,
|
||||
"end_offset": 23,
|
||||
"type": "word",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "jumped",
|
||||
"start_offset": 24,
|
||||
"end_offset": 30,
|
||||
"type": "word",
|
||||
"position": 4
|
||||
},
|
||||
{
|
||||
"token": "lazy",
|
||||
"start_offset": 40,
|
||||
"end_offset": 44,
|
||||
"type": "word",
|
||||
"position": 7
|
||||
},
|
||||
{
|
||||
"token": "dog",
|
||||
"start_offset": 45,
|
||||
"end_offset": 48,
|
||||
"type": "word",
|
||||
"position": 8
|
||||
},
|
||||
{
|
||||
"token": "s",
|
||||
"start_offset": 49,
|
||||
"end_offset": 50,
|
||||
"type": "word",
|
||||
"position": 9
|
||||
},
|
||||
{
|
||||
"token": "bone",
|
||||
"start_offset": 51,
|
||||
"end_offset": 55,
|
||||
"type": "word",
|
||||
"position": 10
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above example produces the following terms:
|
||||
|
||||
[source,text]
|
||||
|
|
|
@ -25,6 +25,90 @@ POST _analyze
|
|||
---------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "The",
|
||||
"start_offset": 0,
|
||||
"end_offset": 3,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "2",
|
||||
"start_offset": 4,
|
||||
"end_offset": 5,
|
||||
"type": "word",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "QUICK",
|
||||
"start_offset": 6,
|
||||
"end_offset": 11,
|
||||
"type": "word",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "Brown-Foxes",
|
||||
"start_offset": 12,
|
||||
"end_offset": 23,
|
||||
"type": "word",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "jumped",
|
||||
"start_offset": 24,
|
||||
"end_offset": 30,
|
||||
"type": "word",
|
||||
"position": 4
|
||||
},
|
||||
{
|
||||
"token": "over",
|
||||
"start_offset": 31,
|
||||
"end_offset": 35,
|
||||
"type": "word",
|
||||
"position": 5
|
||||
},
|
||||
{
|
||||
"token": "the",
|
||||
"start_offset": 36,
|
||||
"end_offset": 39,
|
||||
"type": "word",
|
||||
"position": 6
|
||||
},
|
||||
{
|
||||
"token": "lazy",
|
||||
"start_offset": 40,
|
||||
"end_offset": 44,
|
||||
"type": "word",
|
||||
"position": 7
|
||||
},
|
||||
{
|
||||
"token": "dog's",
|
||||
"start_offset": 45,
|
||||
"end_offset": 50,
|
||||
"type": "word",
|
||||
"position": 8
|
||||
},
|
||||
{
|
||||
"token": "bone.",
|
||||
"start_offset": 51,
|
||||
"end_offset": 56,
|
||||
"type": "word",
|
||||
"position": 9
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above sentence would produce the following terms:
|
||||
|
||||
[source,text]
|
||||
|
|
|
@ -1,16 +1,36 @@
|
|||
[[analysis-charfilters]]
|
||||
== Character Filters
|
||||
|
||||
Character filters are used to preprocess the string of
|
||||
characters before it is passed to the <<analysis-tokenizers,tokenizer>>.
|
||||
A character filter may be used to strip out HTML markup, or to convert
|
||||
`"&"` characters to the word `"and"`.
|
||||
_Character filters_ are used to preprocess the stream of characters before it
|
||||
is passed to the <<analysis-tokenizers,tokenizer>>.
|
||||
|
||||
Elasticsearch has built in characters filters which can be
|
||||
used to build <<analysis-custom-analyzer,custom analyzers>>.
|
||||
A character filter receives the original text as a stream of characters and
|
||||
can transform the stream by adding, removing, or changing characters. For
|
||||
instance, a character filter could be used to convert Arabic numerals
|
||||
(٠١٢٣٤٥٦٧٨٩) into their Latin equivalents (0123456789), or to strip HTML
|
||||
elements like `<b>` from the stream.
|
||||
|
||||
include::charfilters/mapping-charfilter.asciidoc[]
|
||||
|
||||
Elasticsearch has a number of built in character filters which can be used to build
|
||||
<<analysis-custom-analyzer,custom analyzers>>.
|
||||
|
||||
<<analysis-htmlstrip-charfilter,HTML Strip Character Filter>>::
|
||||
|
||||
The `html_strip` character filter strips out HTML elements like `<b>` and
|
||||
decodes HTML entities like `&`.
|
||||
|
||||
<<analysis-mapping-charfilter,Mapping Character Filter>>::
|
||||
|
||||
The `mapping` character filter replaces any occurrences of the specified
|
||||
strings with the specified replacements.
|
||||
|
||||
<<analysis-pattern-replace-charfilter,Pattern Replace Character Filter>>::
|
||||
|
||||
The `pattern_replace` character filter replaces any characters matching a
|
||||
regular expression with the specified replacement.
|
||||
|
||||
include::charfilters/htmlstrip-charfilter.asciidoc[]
|
||||
|
||||
include::charfilters/mapping-charfilter.asciidoc[]
|
||||
|
||||
include::charfilters/pattern-replace-charfilter.asciidoc[]
|
||||
|
|
|
@ -1,5 +1,135 @@
|
|||
[[analysis-htmlstrip-charfilter]]
|
||||
=== HTML Strip Char Filter
|
||||
|
||||
A char filter of type `html_strip` stripping out HTML elements from an
|
||||
analyzed text.
|
||||
The `html_strip` character filter strips HTML elements from the text and
|
||||
replaces HTML entities with their decoded value (e.g. replacing `&` with
|
||||
`&`).
|
||||
|
||||
[float]
|
||||
=== Example output
|
||||
|
||||
[source,js]
|
||||
---------------------------
|
||||
POST _analyze
|
||||
{
|
||||
"tokenizer": "keyword", <1>
|
||||
"char_filter": [ "html_strip" ],
|
||||
"text": "<p>I'm so <b>happy</b>!</p>"
|
||||
}
|
||||
---------------------------
|
||||
// CONSOLE
|
||||
<1> The <<analysis-keyword-tokenizer,`keyword` tokenizer>> returns a single term.
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "\nI'm so happy!\n",
|
||||
"start_offset": 0,
|
||||
"end_offset": 32,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above example returns the term:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ \nI'm so happy!\n ]
|
||||
---------------------------
|
||||
|
||||
The same example with the `standard` tokenizer would return the following terms:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ I'm, so, happy ]
|
||||
---------------------------
|
||||
|
||||
[float]
|
||||
=== Configuration
|
||||
|
||||
The `html_strip` character filter accepts the following parameter:
|
||||
|
||||
[horizontal]
|
||||
`escaped_tags`::
|
||||
|
||||
An array of HTML tags which should not be stripped from the original text.
|
||||
|
||||
[float]
|
||||
=== Example configuration
|
||||
|
||||
In this example, we configure the `html_strip` character filter to leave `<b>`
|
||||
tags in place:
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
PUT my_index
|
||||
{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"analyzer": {
|
||||
"my_analyzer": {
|
||||
"tokenizer": "keyword",
|
||||
"char_filter": ["my_char_filter"]
|
||||
}
|
||||
},
|
||||
"char_filter": {
|
||||
"my_char_filter": {
|
||||
"type": "html_strip",
|
||||
"escaped_tags": ["b"]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GET _cluster/health?wait_for_status=yellow
|
||||
|
||||
POST my_index/_analyze
|
||||
{
|
||||
"analyzer": "my_analyzer",
|
||||
"text": "<p>I'm so <b>happy</b>!</p>"
|
||||
}
|
||||
----------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "\nI'm so <b>happy</b>!\n",
|
||||
"start_offset": 0,
|
||||
"end_offset": 32,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above example produces the following term:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ \nI'm so <b>happy</b>!\n ]
|
||||
---------------------------
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,42 +1,202 @@
|
|||
[[analysis-mapping-charfilter]]
|
||||
=== Mapping Char Filter
|
||||
|
||||
A char filter of type `mapping` replacing characters of an analyzed text
|
||||
with given mapping.
|
||||
The `mapping` character filter accepts a map of keys and values. Whenever it
|
||||
encounters a string of characters that is the same as a key, it replaces them
|
||||
with the value associated with that key.
|
||||
|
||||
Matching is greedy; the longest pattern matching at a given point wins.
|
||||
Replacements are allowed to be the empty string.
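The greedy, longest-match behaviour can be verified with a small experiment.
The following sketch is illustrative only (the `greedy_example` index and the
filter names are made up for this example): with mappings for both `a` and
`ab`, analyzing the text `ab` should produce the single term `2` rather than
`1b`, because the longer key `ab` wins.

[source,js]
----------------------------
PUT greedy_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "greedy_analyzer": {
          "tokenizer": "keyword",
          "char_filter": [ "greedy_char_filter" ]
        }
      },
      "char_filter": {
        "greedy_char_filter": {
          "type": "mapping",
          "mappings": [
            "a => 1",
            "ab => 2"
          ]
        }
      }
    }
  }
}

POST greedy_example/_analyze
{
  "analyzer": "greedy_analyzer",
  "text": "ab"
}
----------------------------
// CONSOLE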
|
||||
|
||||
[float]
|
||||
=== Configuration
|
||||
|
||||
The `mapping` character filter accepts the following parameters:
|
||||
|
||||
[horizontal]
|
||||
`mappings`::
|
||||
|
||||
A list of mappings to use.
|
||||
An array of mappings, with each element having the form `key => value`.
|
||||
|
||||
`mappings_path`::
|
||||
|
||||
A path, relative to the `config` directory, to a mappings file
|
||||
configuration.
|
||||
A path, either absolute or relative to the `config` directory, to a UTF-8
|
||||
encoded text mappings file containing a `key => value` mapping per line.
|
||||
|
||||
Here is a sample configuration:
|
||||
Either the `mappings` or `mappings_path` parameter must be provided.
|
||||
|
||||
[float]
|
||||
=== Example configuration
|
||||
|
||||
In this example, we configure the `mapping` character filter to replace Arabic
|
||||
numerals with their Latin equivalents:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
----------------------------
|
||||
PUT my_index
|
||||
{
|
||||
"index" : {
|
||||
"analysis" : {
|
||||
"char_filter" : {
|
||||
"my_mapping" : {
|
||||
"type" : "mapping",
|
||||
"mappings" : [
|
||||
"ph => f",
|
||||
"qu => k"
|
||||
]
|
||||
}
|
||||
},
|
||||
"analyzer" : {
|
||||
"custom_with_char_filter" : {
|
||||
"tokenizer" : "standard",
|
||||
"char_filter" : ["my_mapping"]
|
||||
}
|
||||
}
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"analyzer": {
|
||||
"my_analyzer": {
|
||||
"tokenizer": "keyword",
|
||||
"char_filter": [
|
||||
"my_char_filter"
|
||||
]
|
||||
}
|
||||
},
|
||||
"char_filter": {
|
||||
"my_char_filter": {
|
||||
"type": "mapping",
|
||||
"mappings": [
|
||||
"٠ => 0",
|
||||
"١ => 1",
|
||||
"٢ => 2",
|
||||
"٣ => 3",
|
||||
"٤ => 4",
|
||||
"٥ => 5",
|
||||
"٦ => 6",
|
||||
"٧ => 7",
|
||||
"٨ => 8",
|
||||
"٩ => 9"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
GET _cluster/health?wait_for_status=yellow
|
||||
|
||||
POST my_index/_analyze
|
||||
{
|
||||
"analyzer": "my_analyzer",
|
||||
"text": "My license plate is ٢٥٠١٥"
|
||||
}
|
||||
----------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "My license plate is 25015",
|
||||
"start_offset": 0,
|
||||
"end_offset": 25,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above example produces the following term:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ My license plate is 25015 ]
|
||||
---------------------------
|
||||
|
||||
Keys and values can be strings with multiple characters. The following
|
||||
example replaces the `:)` and `:(` emoticons with a text equivalent:
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
PUT my_index
|
||||
{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"analyzer": {
|
||||
"my_analyzer": {
|
||||
"tokenizer": "standard",
|
||||
"char_filter": [
|
||||
"my_char_filter"
|
||||
]
|
||||
}
|
||||
},
|
||||
"char_filter": {
|
||||
"my_char_filter": {
|
||||
"type": "mapping",
|
||||
"mappings": [
|
||||
":) => _happy_",
|
||||
":( => _sad_"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GET _cluster/health?wait_for_status=yellow
|
||||
|
||||
POST my_index/_analyze
|
||||
{
|
||||
"analyzer": "my_analyzer",
|
||||
"text": "I'm delighted about it :("
|
||||
}
|
||||
----------------------------
|
||||
// CONSOLE
|
||||
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "I'm",
|
||||
"start_offset": 0,
|
||||
"end_offset": 3,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "delighted",
|
||||
"start_offset": 4,
|
||||
"end_offset": 13,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "about",
|
||||
"start_offset": 14,
|
||||
"end_offset": 19,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "it",
|
||||
"start_offset": 20,
|
||||
"end_offset": 22,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "_sad_",
|
||||
"start_offset": 23,
|
||||
"end_offset": 25,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above example produces the following terms:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ I'm, delighted, about, it, _sad_ ]
|
||||
---------------------------
|
||||
|
|
|
@ -1,37 +1,249 @@
|
|||
[[analysis-pattern-replace-charfilter]]
|
||||
=== Pattern Replace Char Filter
|
||||
|
||||
The `pattern_replace` char filter allows the use of a regex to
|
||||
manipulate the characters in a string before analysis. The regular
|
||||
expression is defined using the `pattern` parameter, and the replacement
|
||||
string can be provided using the `replacement` parameter (supporting
|
||||
referencing the original text, as explained
|
||||
http://docs.oracle.com/javase/6/docs/api/java/util/regex/Matcher.html#appendReplacement(java.lang.StringBuffer,%20java.lang.String)[here]).
|
||||
For more information check the
|
||||
http://lucene.apache.org/core/4_3_1/analyzers-common/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.html[lucene
|
||||
documentation]
|
||||
The `pattern_replace` character filter uses a regular expression to match
|
||||
characters which should be replaced with the specified replacement string.
|
||||
The replacement string can refer to capture groups in the regular expression.
|
||||
|
||||
Here is a sample configuration:
|
||||
[float]
|
||||
=== Configuration
|
||||
|
||||
The `pattern_replace` character filter accepts the following parameters:
|
||||
|
||||
[horizontal]
|
||||
`pattern`::
|
||||
|
||||
A http://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html[Java regular expression]. Required.
|
||||
|
||||
`replacement`::
|
||||
|
||||
The replacement string, which can reference capture groups using the
|
||||
`$1`..`$9` syntax, as explained
|
||||
http://docs.oracle.com/javase/8/docs/api/java/util/regex/Matcher.html#appendReplacement-java.lang.StringBuffer-java.lang.String-[here].
|
||||
|
||||
[float]
|
||||
=== Example configuration
|
||||
|
||||
In this example, we configure the `pattern_replace` character filter to
|
||||
replace any embedded dashes in numbers with underscores, i.e. `123-456-789` ->
|
||||
`123_456_789`:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
----------------------------
|
||||
PUT my_index
|
||||
{
|
||||
"index" : {
|
||||
"analysis" : {
|
||||
"char_filter" : {
|
||||
"my_pattern":{
|
||||
"type":"pattern_replace",
|
||||
"pattern":"sample(.*)",
|
||||
"replacement":"replacedSample $1"
|
||||
}
|
||||
},
|
||||
"analyzer" : {
|
||||
"custom_with_char_filter" : {
|
||||
"tokenizer" : "standard",
|
||||
"char_filter" : ["my_pattern"]
|
||||
}
|
||||
}
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"analyzer": {
|
||||
"my_analyzer": {
|
||||
"tokenizer": "standard",
|
||||
"char_filter": [
|
||||
"my_char_filter"
|
||||
]
|
||||
}
|
||||
},
|
||||
"char_filter": {
|
||||
"my_char_filter": {
|
||||
"type": "pattern_replace",
|
||||
"pattern": "(\\d+)-(?=\\d)",
|
||||
"replacement": "$1_"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
GET _cluster/health?wait_for_status=yellow
|
||||
|
||||
POST my_index/_analyze
|
||||
{
|
||||
"analyzer": "my_analyzer",
|
||||
"text": "My credit card is 123-456-789"
|
||||
}
|
||||
----------------------------
|
||||
// CONSOLE
|
||||
// TEST[skip:Test interprets $1 as a stashed variable]
|
||||
|
||||
The above example produces the following terms:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ My, credit, card, is, 123_456_789 ]
|
||||
---------------------------
|
||||
|
||||
|
||||
WARNING: Using a replacement string that changes the length of the original
|
||||
text will work for search purposes, but will result in incorrect highlighting,
|
||||
as can be seen in the following example.
|
||||
|
||||
This example inserts a space whenever it encounters a lower-case letter
|
||||
followed by an upper-case letter (i.e. `fooBarBaz` -> `foo Bar Baz`), allowing
|
||||
camelCase words to be queried individually:
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
PUT my_index
|
||||
{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"analyzer": {
|
||||
"my_analyzer": {
|
||||
"tokenizer": "standard",
|
||||
"char_filter": [
|
||||
"my_char_filter"
|
||||
],
|
||||
"filter": [
|
||||
"lowercase"
|
||||
]
|
||||
}
|
||||
},
|
||||
"char_filter": {
|
||||
"my_char_filter": {
|
||||
"type": "pattern_replace",
|
||||
"pattern": "(?<=\\p{Lower})(?=\\p{Upper})",
|
||||
"replacement": " "
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"mappings": {
|
||||
"my_type": {
|
||||
"properties": {
|
||||
"text": {
|
||||
"type": "text",
|
||||
"analyzer": "my_analyzer"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GET _cluster/health?wait_for_status=yellow
|
||||
|
||||
POST my_index/_analyze
|
||||
{
|
||||
"analyzer": "my_analyzer",
|
||||
"text": "The fooBarBaz method"
|
||||
}
|
||||
----------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "the",
|
||||
"start_offset": 0,
|
||||
"end_offset": 3,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "foo",
|
||||
"start_offset": 4,
|
||||
"end_offset": 6,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "bar",
|
||||
"start_offset": 7,
|
||||
"end_offset": 9,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "baz",
|
||||
"start_offset": 10,
|
||||
"end_offset": 13,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "method",
|
||||
"start_offset": 14,
|
||||
"end_offset": 20,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
The above returns the following terms:
|
||||
|
||||
[source,text]
|
||||
----------------------------
|
||||
[ the, foo, bar, baz, method ]
|
||||
----------------------------
|
||||
|
||||
Querying for `bar` will find the document correctly, but highlighting on the
|
||||
result will produce incorrect highlights, because our character filter changed
|
||||
the length of the original text:
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
PUT my_index/my_doc/1?refresh
|
||||
{
|
||||
"text": "The fooBarBaz method"
|
||||
}
|
||||
|
||||
GET my_index/_search
|
||||
{
|
||||
"query": {
|
||||
"match": {
|
||||
"text": "bar"
|
||||
}
|
||||
},
|
||||
"highlight": {
|
||||
"fields": {
|
||||
"text": {}
|
||||
}
|
||||
}
|
||||
}
|
||||
----------------------------
|
||||
// CONSOLE
|
||||
// TEST[continued]
|
||||
|
||||
The output from the above is:
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"timed_out": false,
|
||||
"took": $body.took,
|
||||
"_shards": {
|
||||
"total": 5,
|
||||
"successful": 5,
|
||||
"failed": 0
|
||||
},
|
||||
"hits": {
|
||||
"total": 1,
|
||||
"max_score": 0.4375,
|
||||
"hits": [
|
||||
{
|
||||
"_index": "my_index",
|
||||
"_type": "my_doc",
|
||||
"_id": "1",
|
||||
"_score": 0.4375,
|
||||
"_source": {
|
||||
"text": "The fooBarBaz method"
|
||||
},
|
||||
"highlight": {
|
||||
"text": [
|
||||
"The foo<em>Ba</em>rBaz method" <1>
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE[s/"took".*/"took": "$body.took",/]
|
||||
<1> Note the incorrect highlight.
|
||||
|
|
|
@ -1,34 +1,136 @@
|
|||
[[analysis-tokenizers]]
|
||||
== Tokenizers
|
||||
|
||||
Tokenizers are used to break a string down into a stream of terms
|
||||
or tokens. A simple tokenizer might split the string up into terms
|
||||
wherever it encounters whitespace or punctuation.
|
||||
A _tokenizer_ receives a stream of characters, breaks it up into individual
|
||||
_tokens_ (usually individual words), and outputs a stream of _tokens_. For
|
||||
instance, a <<analysis-whitespace-tokenizer,`whitespace`>> tokenizer breaks
|
||||
text into tokens whenever it sees any whitespace. It would convert the text
|
||||
`"Quick brown fox!"` into the terms `[Quick, brown, fox!]`.
|
||||
|
||||
The tokenizer is also responsible for recording the order or _position_ of
|
||||
each term (used for phrase and word proximity queries) and the start and end
|
||||
_character offsets_ of the original word which the term represents (used for
|
||||
highlighting search snippets).
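As a minimal illustration of the points above, the built-in `whitespace`
tokenizer can be called directly through the `_analyze` API used throughout
these docs (this request is only a sketch, not one of the tested examples);
the response should contain the terms `Quick`, `brown` and `fox!`, each with
its `position` and its `start_offset`/`end_offset` character offsets:

[source,js]
---------------------------
POST _analyze
{
  "tokenizer": "whitespace",
  "text": "Quick brown fox!"
}
---------------------------
// CONSOLE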
|
||||
|
||||
Elasticsearch has a number of built in tokenizers which can be used to build
|
||||
<<analysis-custom-analyzer,custom analyzers>>.
|
||||
|
||||
[float]
|
||||
=== Word Oriented Tokenizers
|
||||
|
||||
The following tokenizers are usually used for tokenizing full text into
|
||||
individual words:
|
||||
|
||||
<<analysis-standard-tokenizer,Standard Tokenizer>>::
|
||||
|
||||
The `standard` tokenizer divides text into terms on word boundaries, as
|
||||
defined by the Unicode Text Segmentation algorithm. It removes most
|
||||
punctuation symbols. It is the best choice for most languages.
|
||||
|
||||
<<analysis-letter-tokenizer,Letter Tokenizer>>::
|
||||
|
||||
The `letter` tokenizer divides text into terms whenever it encounters a
|
||||
character which is not a letter.
|
||||
|
||||
<<analysis-lowercase-tokenizer,Lowercase Tokenizer>>::
|
||||
|
||||
The `lowercase` tokenizer, like the `letter` tokenizer, divides text into
|
||||
terms whenever it encounters a character which is not a letter, but it also
|
||||
lowercases all terms.
|
||||
|
||||
<<analysis-whitespace-tokenizer,Whitespace Tokenizer>>::
|
||||
|
||||
The `whitespace` tokenizer divides text into terms whenever it encounters any
|
||||
whitespace character.
|
||||
|
||||
<<analysis-uaxurlemail-tokenizer,UAX URL Email Tokenizer>>::
|
||||
|
||||
The `uax_url_email` tokenizer is like the `standard` tokenizer except that it
|
||||
recognises URLs and email addresses as single tokens.
|
||||
|
||||
<<analysis-classic-tokenizer,Classic Tokenizer>>::
|
||||
|
||||
The `classic` tokenizer is a grammar based tokenizer for the English Language.
|
||||
|
||||
<<analysis-thai-tokenizer,Thai Tokenizer>>::
|
||||
|
||||
The `thai` tokenizer segments Thai text into words.
|
||||
|
||||
[float]
|
||||
=== Partial Word Tokenizers
|
||||
|
||||
These tokenizers break up text or words into small fragments, for partial word
|
||||
matching:
|
||||
|
||||
<<analysis-ngram-tokenizer,N-Gram Tokenizer>>::
|
||||
|
||||
The `ngram` tokenizer can break up text into words when it encounters any of
|
||||
a list of specified characters (e.g. whitespace or punctuation), then it returns
|
||||
n-grams of each word: a sliding window of continuous letters, e.g. `quick` ->
|
||||
`[qu, ui, ic, ck]`.
|
||||
|
||||
<<analysis-edgengram-tokenizer,Edge N-Gram Tokenizer>>::
|
||||
|
||||
The `edge_ngram` tokenizer can break up text into words when it encounters any of
|
||||
a list of specified characters (e.g. whitespace or punctuation), then it returns
|
||||
n-grams of each word which are anchored to the start of the word, e.g. `quick` ->
|
||||
`[q, qu, qui, quic, quick]`.
|
||||
|
||||
|
||||
[float]
|
||||
=== Structured Text Tokenizers
|
||||
|
||||
The following tokenizers are usually used with structured text like
|
||||
identifiers, email addresses, zip codes, and paths, rather than with full
|
||||
text:
|
||||
|
||||
<<analysis-keyword-tokenizer,Keyword Tokenizer>>::
|
||||
|
||||
The `keyword` tokenizer is a ``noop'' tokenizer that accepts whatever text it
|
||||
is given and outputs the exact same text as a single term. It can be combined
|
||||
with token filters like <<analysis-lowercase-tokenfilter,`lowercase`>> to
|
||||
normalise the analysed terms.
|
||||
|
||||
<<analysis-pattern-tokenizer,Pattern Tokenizer>>::
|
||||
|
||||
The `pattern` tokenizer uses a regular expression to either split text into
|
||||
terms whenever it matches a word separator, or to capture matching text as
|
||||
terms.
|
||||
|
||||
<<analysis-pathhierarchy-tokenizer,Path Tokenizer>>::
|
||||
|
||||
The `path_hierarchy` tokenizer takes a hierarchical value like a filesystem
|
||||
path, splits on the path separator, and emits a term for each component in the
|
||||
tree, e.g. `/foo/bar/baz` -> `[/foo, /foo/bar, /foo/bar/baz ]`.
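As a quick check of the `path_hierarchy` behaviour described in the last item
above, the tokenizer can be exercised directly with the `_analyze` API (a
sketch rather than one of the tested examples); the expected terms are
`/foo`, `/foo/bar` and `/foo/bar/baz`:

[source,js]
---------------------------
POST _analyze
{
  "tokenizer": "path_hierarchy",
  "text": "/foo/bar/baz"
}
---------------------------
// CONSOLE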
|
||||
|
||||
|
||||
|
||||
|
||||
Elasticsearch has a number of built in tokenizers which can be
|
||||
used to build <<analysis-custom-analyzer,custom analyzers>>.
|
||||
|
||||
include::tokenizers/standard-tokenizer.asciidoc[]
|
||||
|
||||
include::tokenizers/edgengram-tokenizer.asciidoc[]
|
||||
|
||||
include::tokenizers/keyword-tokenizer.asciidoc[]
|
||||
|
||||
include::tokenizers/letter-tokenizer.asciidoc[]
|
||||
|
||||
include::tokenizers/lowercase-tokenizer.asciidoc[]
|
||||
|
||||
include::tokenizers/ngram-tokenizer.asciidoc[]
|
||||
|
||||
include::tokenizers/whitespace-tokenizer.asciidoc[]
|
||||
|
||||
include::tokenizers/pattern-tokenizer.asciidoc[]
|
||||
|
||||
include::tokenizers/uaxurlemail-tokenizer.asciidoc[]
|
||||
|
||||
include::tokenizers/pathhierarchy-tokenizer.asciidoc[]
|
||||
|
||||
include::tokenizers/classic-tokenizer.asciidoc[]
|
||||
|
||||
include::tokenizers/thai-tokenizer.asciidoc[]
|
||||
|
||||
|
||||
include::tokenizers/ngram-tokenizer.asciidoc[]
|
||||
|
||||
include::tokenizers/edgengram-tokenizer.asciidoc[]
|
||||
|
||||
|
||||
include::tokenizers/keyword-tokenizer.asciidoc[]
|
||||
|
||||
include::tokenizers/pattern-tokenizer.asciidoc[]
|
||||
|
||||
include::tokenizers/pathhierarchy-tokenizer.asciidoc[]
|
||||
|
||||
|
||||
|
|
|
@ -1,19 +1,269 @@
|
|||
[[analysis-classic-tokenizer]]
|
||||
=== Classic Tokenizer
|
||||
|
||||
A tokenizer of type `classic` providing grammar based tokenizer that is
|
||||
a good tokenizer for English language documents. This tokenizer has
|
||||
heuristics for special treatment of acronyms, company names, email addresses,
|
||||
and internet host names. However, these rules don't always work, and
|
||||
the tokenizer doesn't work well for most languages other than English.
|
||||
The `classic` tokenizer is a grammar based tokenizer that is good for English
|
||||
language documents. This tokenizer has heuristics for special treatment of
|
||||
acronyms, company names, email addresses, and internet host names. However,
|
||||
these rules don't always work, and the tokenizer doesn't work well for most
|
||||
languages other than English:
|
||||
|
||||
* It splits words at most punctuation characters, removing punctuation. However, a
|
||||
dot that's not followed by whitespace is considered part of a token.
|
||||
|
||||
* It splits words at hyphens, unless there's a number in the token, in which case
|
||||
the whole token is interpreted as a product number and is not split.
|
||||
|
||||
* It recognizes email addresses and internet hostnames as one token.
|
||||
|
||||
[float]
|
||||
=== Example output
|
||||
|
||||
[source,js]
|
||||
---------------------------
|
||||
POST _analyze
|
||||
{
|
||||
"tokenizer": "classic",
|
||||
"text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
|
||||
}
|
||||
---------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "The",
|
||||
"start_offset": 0,
|
||||
"end_offset": 3,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "2",
|
||||
"start_offset": 4,
|
||||
"end_offset": 5,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "QUICK",
|
||||
"start_offset": 6,
|
||||
"end_offset": 11,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "Brown",
|
||||
"start_offset": 12,
|
||||
"end_offset": 17,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "Foxes",
|
||||
"start_offset": 18,
|
||||
"end_offset": 23,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 4
|
||||
},
|
||||
{
|
||||
"token": "jumped",
|
||||
"start_offset": 24,
|
||||
"end_offset": 30,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 5
|
||||
},
|
||||
{
|
||||
"token": "over",
|
||||
"start_offset": 31,
|
||||
"end_offset": 35,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 6
|
||||
},
|
||||
{
|
||||
"token": "the",
|
||||
"start_offset": 36,
|
||||
"end_offset": 39,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 7
|
||||
},
|
||||
{
|
||||
"token": "lazy",
|
||||
"start_offset": 40,
|
||||
"end_offset": 44,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 8
|
||||
},
|
||||
{
|
||||
"token": "dog's",
|
||||
"start_offset": 45,
|
||||
"end_offset": 50,
|
||||
"type": "<APOSTROPHE>",
|
||||
"position": 9
|
||||
},
|
||||
{
|
||||
"token": "bone",
|
||||
"start_offset": 51,
|
||||
"end_offset": 55,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 10
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above sentence would produce the following terms:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
|
||||
---------------------------
|
||||
|
||||
[float]
|
||||
=== Configuration
|
||||
|
||||
The `classic` tokenizer accepts the following parameters:
|
||||
|
||||
[horizontal]
|
||||
`max_token_length`::
|
||||
|
||||
The maximum token length. If a token is seen that exceeds this length then
|
||||
it is split at `max_token_length` intervals. Defaults to `255`.
|
||||
|
||||
[float]
|
||||
=== Example configuration
|
||||
|
||||
In this example, we configure the `classic` tokenizer to have a
|
||||
`max_token_length` of 5 (for demonstration purposes):
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
PUT my_index
|
||||
{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"analyzer": {
|
||||
"my_analyzer": {
|
||||
"tokenizer": "my_tokenizer"
|
||||
}
|
||||
},
|
||||
"tokenizer": {
|
||||
"my_tokenizer": {
|
||||
"type": "classic",
|
||||
"max_token_length": 5
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GET _cluster/health?wait_for_status=yellow
|
||||
|
||||
POST my_index/_analyze
|
||||
{
|
||||
"analyzer": "my_analyzer",
|
||||
"text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
|
||||
}
|
||||
----------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "The",
|
||||
"start_offset": 0,
|
||||
"end_offset": 3,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "2",
|
||||
"start_offset": 4,
|
||||
"end_offset": 5,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "QUICK",
|
||||
"start_offset": 6,
|
||||
"end_offset": 11,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "Brown",
|
||||
"start_offset": 12,
|
||||
"end_offset": 17,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "Foxes",
|
||||
"start_offset": 18,
|
||||
"end_offset": 23,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 4
|
||||
},
|
||||
{
|
||||
"token": "over",
|
||||
"start_offset": 31,
|
||||
"end_offset": 35,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 6
|
||||
},
|
||||
{
|
||||
"token": "the",
|
||||
"start_offset": 36,
|
||||
"end_offset": 39,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 7
|
||||
},
|
||||
{
|
||||
"token": "lazy",
|
||||
"start_offset": 40,
|
||||
"end_offset": 44,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 8
|
||||
},
|
||||
{
|
||||
"token": "dog's",
|
||||
"start_offset": 45,
|
||||
"end_offset": 50,
|
||||
"type": "<APOSTROPHE>",
|
||||
"position": 9
|
||||
},
|
||||
{
|
||||
"token": "bone",
|
||||
"start_offset": 51,
|
||||
"end_offset": 55,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 10
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above example produces the following terms:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ The, 2, QUICK, Brown, Foxes, jumpe, d, over, the, lazy, dog's, bone ]
|
||||
---------------------------
|
||||
|
||||
The following are settings that can be set for a `classic` tokenizer
|
||||
type:
|
||||
|
||||
[cols="<,<",options="header",]
|
||||
|=======================================================================
|
||||
|Setting |Description
|
||||
|`max_token_length` |The maximum token length. If a token is seen that
|
||||
exceeds this length then it is discarded. Defaults to `255`.
|
||||
|=======================================================================
|
||||
|
||||
|
|
|
@ -1,80 +1,323 @@
|
|||
[[analysis-edgengram-tokenizer]]
|
||||
=== Edge NGram Tokenizer
|
||||
|
||||
A tokenizer of type `edgeNGram`.
|
||||
The `edge_ngram` tokenizer first breaks text down into words whenever it
|
||||
encounters one of a list of specified characters, then it emits
|
||||
https://en.wikipedia.org/wiki/N-gram[N-grams] of each word where the start of
|
||||
the N-gram is anchored to the beginning of the word.
|
||||
|
||||
This tokenizer is very similar to `nGram` but only keeps n-grams which
|
||||
start at the beginning of a token.
|
||||
Edge N-Grams are useful for _search-as-you-type_ queries.
|
||||
|
||||
The following are settings that can be set for a `edgeNGram` tokenizer
|
||||
type:
|
||||
TIP: When you need _search-as-you-type_ for text which has a widely known
|
||||
order, such as movie or song titles, the
|
||||
<<search-suggesters-completion,completion suggester>> is a much more efficient
|
||||
choice than edge N-grams. Edge N-grams have the advantage when trying to
|
||||
autocomplete words that can appear in any order.
|
||||
|
||||
[cols="<,<,<",options="header",]
|
||||
|=======================================================================
|
||||
|Setting |Description |Default value
|
||||
|`min_gram` |Minimum size in codepoints of a single n-gram |`1`.
|
||||
[float]
|
||||
=== Example output
|
||||
|
||||
|`max_gram` |Maximum size in codepoints of a single n-gram |`2`.
|
||||
With the default settings, the `edge_ngram` tokenizer treats the initial text as a
|
||||
single token and produces N-grams with minimum length `1` and maximum length
|
||||
`2`:
|
||||
|
||||
|`token_chars` | Characters classes to keep in the
|
||||
tokens, Elasticsearch will split on characters that don't belong to any
|
||||
of these classes. |`[]` (Keep all characters)
|
||||
|=======================================================================
|
||||
[source,js]
|
||||
---------------------------
|
||||
POST _analyze
|
||||
{
|
||||
"tokenizer": "edge_ngram",
|
||||
"text": "Quick Fox"
|
||||
}
|
||||
---------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "Q",
|
||||
"start_offset": 0,
|
||||
"end_offset": 1,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "Qu",
|
||||
"start_offset": 0,
|
||||
"end_offset": 2,
|
||||
"type": "word",
|
||||
"position": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
`token_chars` accepts the following character classes:
|
||||
The above sentence would produce the following terms:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ Q, Qu ]
|
||||
---------------------------
|
||||
|
||||
NOTE: These default gram lengths are almost entirely useless. You need to
|
||||
configure the `edge_ngram` tokenizer before using it.
|
||||
|
||||
[float]
|
||||
=== Configuration
|
||||
|
||||
The `edge_ngram` tokenizer accepts the following parameters:
|
||||
|
||||
[horizontal]
|
||||
`letter`:: for example `a`, `b`, `ï` or `京`
|
||||
`digit`:: for example `3` or `7`
|
||||
`whitespace`:: for example `" "` or `"\n"`
|
||||
`punctuation`:: for example `!` or `"`
|
||||
`symbol`:: for example `$` or `√`
|
||||
`min_gram`::
|
||||
Minimum length of characters in a gram. Defaults to `1`.
|
||||
|
||||
`max_gram`::
|
||||
Maximum length of characters in a gram. Defaults to `2`.
|
||||
|
||||
`token_chars`::
|
||||
|
||||
Character classes that should be included in a token. Elasticsearch
|
||||
will split on characters that don't belong to the classes specified.
|
||||
Defaults to `[]` (keep all characters).
|
||||
+
|
||||
Character classes may be any of the following:
|
||||
+
|
||||
* `letter` -- for example `a`, `b`, `ï` or `京`
|
||||
* `digit` -- for example `3` or `7`
|
||||
* `whitespace` -- for example `" "` or `"\n"`
|
||||
* `punctuation` -- for example `!` or `"`
|
||||
* `symbol` -- for example `$` or `√`
|
||||
|
||||
[float]
|
||||
==== Example
|
||||
=== Example configuration
|
||||
|
||||
In this example, we configure the `edge_ngram` tokenizer to treat letters and
|
||||
digits as tokens, and to produce grams with minimum length `2` and maximum
|
||||
length `10`:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
curl -XPUT 'localhost:9200/test' -d '
|
||||
{
|
||||
"settings" : {
|
||||
"analysis" : {
|
||||
"analyzer" : {
|
||||
"my_edge_ngram_analyzer" : {
|
||||
"tokenizer" : "my_edge_ngram_tokenizer"
|
||||
}
|
||||
},
|
||||
"tokenizer" : {
|
||||
"my_edge_ngram_tokenizer" : {
|
||||
"type" : "edgeNGram",
|
||||
"min_gram" : "2",
|
||||
"max_gram" : "5",
|
||||
"token_chars": [ "letter", "digit" ]
|
||||
}
|
||||
}
|
||||
}
|
||||
----------------------------
|
||||
PUT my_index
|
||||
{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"analyzer": {
|
||||
"my_analyzer": {
|
||||
"tokenizer": "my_tokenizer"
|
||||
}
|
||||
}'
|
||||
},
|
||||
"tokenizer": {
|
||||
"my_tokenizer": {
|
||||
"type": "edge_ngram",
|
||||
"min_gram": 2,
|
||||
"max_gram": 10,
|
||||
"token_chars": [
|
||||
"letter",
|
||||
"digit"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
curl 'localhost:9200/test/_analyze?pretty=1&analyzer=my_edge_ngram_analyzer' -d 'FC Schalke 04'
|
||||
# FC, Sc, Sch, Scha, Schal, 04
|
||||
--------------------------------------------------
|
||||
GET _cluster/health?wait_for_status=yellow
|
||||
|
||||
[float]
|
||||
==== `side` deprecated
|
||||
POST my_index/_analyze
|
||||
{
|
||||
"analyzer": "my_analyzer",
|
||||
"text": "2 Quick Foxes."
|
||||
}
|
||||
----------------------------
|
||||
// CONSOLE
|
||||
|
||||
There used to be a `side` parameter up to `0.90.1` but it is now deprecated. In
|
||||
order to emulate the behavior of `"side" : "BACK"` a
|
||||
<<analysis-reverse-tokenfilter,`reverse` token filter>> should be used together
|
||||
with the <<analysis-edgengram-tokenfilter,`edgeNGram` token filter>>. The
|
||||
`edgeNGram` filter must be enclosed in `reverse` filters like this:
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
"filter" : ["reverse", "edgeNGram", "reverse"]
|
||||
--------------------------------------------------
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "Qu",
|
||||
"start_offset": 2,
|
||||
"end_offset": 4,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "Qui",
|
||||
"start_offset": 2,
|
||||
"end_offset": 5,
|
||||
"type": "word",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "Quic",
|
||||
"start_offset": 2,
|
||||
"end_offset": 6,
|
||||
"type": "word",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "Quick",
|
||||
"start_offset": 2,
|
||||
"end_offset": 7,
|
||||
"type": "word",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "Fo",
|
||||
"start_offset": 8,
|
||||
"end_offset": 10,
|
||||
"type": "word",
|
||||
"position": 4
|
||||
},
|
||||
{
|
||||
"token": "Fox",
|
||||
"start_offset": 8,
|
||||
"end_offset": 11,
|
||||
"type": "word",
|
||||
"position": 5
|
||||
},
|
||||
{
|
||||
"token": "Foxe",
|
||||
"start_offset": 8,
|
||||
"end_offset": 12,
|
||||
"type": "word",
|
||||
"position": 6
|
||||
},
|
||||
{
|
||||
"token": "Foxes",
|
||||
"start_offset": 8,
|
||||
"end_offset": 13,
|
||||
"type": "word",
|
||||
"position": 7
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
The above example produces the following terms:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ Qu, Qui, Quic, Quick, Fo, Fox, Foxe, Foxes ]
|
||||
---------------------------
|
||||
|
||||
Usually we recommend using the same `analyzer` at index time and at search
|
||||
time. In the case of the `edge_ngram` tokenizer, the advice is different. It
|
||||
only makes sense to use the `edge_ngram` tokenizer at index time, to ensure
|
||||
that partial words are available for matching in the index. At search time,
|
||||
just search for the terms the user has typed in, for instance: `Quick Fo`.
|
||||
|
||||
Below is an example of how to set up a field for _search-as-you-type_:
|
||||
|
||||
[source,js]
|
||||
-----------------------------------
|
||||
PUT my_index
|
||||
{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"analyzer": {
|
||||
"autocomplete": {
|
||||
"tokenizer": "autocomplete",
|
||||
"filter": [
|
||||
"lowercase"
|
||||
]
|
||||
},
|
||||
"autocomplete_search": {
|
||||
"tokenizer": "lowercase"
|
||||
}
|
||||
},
|
||||
"tokenizer": {
|
||||
"autocomplete": {
|
||||
"type": "edge_ngram",
|
||||
"min_gram": 2,
|
||||
"max_gram": 10,
|
||||
"token_chars": [
|
||||
"letter"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"mappings": {
|
||||
"doc": {
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "text",
|
||||
"analyzer": "autocomplete",
|
||||
"search_analyzer": "autocomplete_search"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
PUT my_index/doc/1
|
||||
{
|
||||
"title": "Quick Foxes" <1>
|
||||
}
|
||||
|
||||
POST my_index/_refresh
|
||||
|
||||
GET my_index/_search
|
||||
{
|
||||
"query": {
|
||||
"match": {
|
||||
"title": {
|
||||
"query": "Quick Fo", <2>
|
||||
"operator": "and"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
-----------------------------------
|
||||
// CONSOLE
|
||||
|
||||
<1> The `autocomplete` analyzer indexes the terms `[qu, qui, quic, quick, fo, fox, foxe, foxes]`.
|
||||
<2> The `autocomplete_search` analyzer searches for the terms `[quick, fo]`, both of which appear in the index.
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"took": $body.took,
|
||||
"timed_out": false,
|
||||
"_shards": {
|
||||
"total": 5,
|
||||
"successful": 5,
|
||||
"failed": 0
|
||||
},
|
||||
"hits": {
|
||||
"total": 1,
|
||||
"max_score": 0.44194174,
|
||||
"hits": [
|
||||
{
|
||||
"_index": "my_index",
|
||||
"_type": "doc",
|
||||
"_id": "1",
|
||||
"_score": 0.44194174,
|
||||
"_source": {
|
||||
"title": "Quick Foxes"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE[s/"took".*/"took": "$body.took",/]
|
||||
/////////////////////
|
||||
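To see exactly which terms the `autocomplete` analyzer defined above emits, the
`_analyze` API can be called against the index. The request below is an
illustrative addition (not part of the original change) and assumes the
`my_index` settings from the _search-as-you-type_ example above have been
created:

[source,js]
----------------------------
POST my_index/_analyze
{
  "analyzer": "autocomplete",
  "text": "Quick Foxes"
}
----------------------------

With the `lowercase` token filter applied after the `edge_ngram` tokenizer, the
response should contain the terms listed in callout <1> above:
`[ qu, qui, quic, quick, fo, fox, foxe, foxes ]`.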
|
||||
which essentially reverses the token, builds front `EdgeNGrams` and reverses
|
||||
the ngram again. This has the same effect as the previous `"side" : "BACK"` setting.
|
||||
|
||||
|
|
|
@ -1,15 +1,60 @@
|
|||
[[analysis-keyword-tokenizer]]
|
||||
=== Keyword Tokenizer
|
||||
|
||||
A tokenizer of type `keyword` that emits the entire input as a single
|
||||
output.
|
||||
The `keyword` tokenizer is a ``noop'' tokenizer that accepts whatever text it
|
||||
is given and outputs the exact same text as a single term. It can be combined
|
||||
with token filters to normalise output, e.g. lower-casing email addresses.
|
||||
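For instance, combining the `keyword` tokenizer with the `lowercase` token
filter turns the whole input into a single lower-cased term. The request below
is only an illustrative sketch (not part of the original change), and assumes
the `_analyze` API's `filter` parameter:

[source,js]
----------------------------
POST _analyze
{
  "tokenizer": "keyword",
  "filter": [ "lowercase" ],
  "text": "John.SMITH@example.COM"
}
----------------------------

The response should contain the single term `john.smith@example.com`.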
|
||||
The following are settings that can be set for a `keyword` tokenizer
|
||||
type:
|
||||
[float]
|
||||
=== Example output
|
||||
|
||||
[cols="<,<",options="header",]
|
||||
|=======================================================
|
||||
|Setting |Description
|
||||
|`buffer_size` |The term buffer size. Defaults to `256`.
|
||||
|=======================================================
|
||||
[source,js]
|
||||
---------------------------
|
||||
POST _analyze
|
||||
{
|
||||
"tokenizer": "keyword",
|
||||
"text": "New York"
|
||||
}
|
||||
---------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "New York",
|
||||
"start_offset": 0,
|
||||
"end_offset": 8,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above sentence would produce the following term:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ New York ]
|
||||
---------------------------
|
||||
|
||||
[float]
|
||||
=== Configuration
|
||||
|
||||
The `keyword` tokenizer accepts the following parameters:
|
||||
|
||||
[horizontal]
|
||||
`buffer_size`::
|
||||
|
||||
The number of characters read into the term buffer in a single pass.
|
||||
Defaults to `256`. The term buffer will grow by this size until all the
|
||||
text has been consumed. It is advisable not to change this setting.
|
||||
|
||||
|
|
|
@ -1,7 +1,123 @@
|
|||
[[analysis-letter-tokenizer]]
|
||||
=== Letter Tokenizer
|
||||
|
||||
A tokenizer of type `letter` that divides text at non-letters. That's to
|
||||
say, it defines tokens as maximal strings of adjacent letters. Note,
|
||||
this does a decent job for most European languages, but does a terrible
|
||||
job for some Asian languages, where words are not separated by spaces.
|
||||
The `letter` tokenizer breaks text into terms whenever it encounters a
|
||||
character which is not a letter. It does a reasonable job for most European
|
||||
languages, but does a terrible job for some Asian languages, where words are
|
||||
not separated by spaces.
|
||||
|
||||
[float]
|
||||
=== Example output
|
||||
|
||||
[source,js]
|
||||
---------------------------
|
||||
POST _analyze
|
||||
{
|
||||
"tokenizer": "letter",
|
||||
"text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
|
||||
}
|
||||
---------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "The",
|
||||
"start_offset": 0,
|
||||
"end_offset": 3,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "QUICK",
|
||||
"start_offset": 6,
|
||||
"end_offset": 11,
|
||||
"type": "word",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "Brown",
|
||||
"start_offset": 12,
|
||||
"end_offset": 17,
|
||||
"type": "word",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "Foxes",
|
||||
"start_offset": 18,
|
||||
"end_offset": 23,
|
||||
"type": "word",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "jumped",
|
||||
"start_offset": 24,
|
||||
"end_offset": 30,
|
||||
"type": "word",
|
||||
"position": 4
|
||||
},
|
||||
{
|
||||
"token": "over",
|
||||
"start_offset": 31,
|
||||
"end_offset": 35,
|
||||
"type": "word",
|
||||
"position": 5
|
||||
},
|
||||
{
|
||||
"token": "the",
|
||||
"start_offset": 36,
|
||||
"end_offset": 39,
|
||||
"type": "word",
|
||||
"position": 6
|
||||
},
|
||||
{
|
||||
"token": "lazy",
|
||||
"start_offset": 40,
|
||||
"end_offset": 44,
|
||||
"type": "word",
|
||||
"position": 7
|
||||
},
|
||||
{
|
||||
"token": "dog",
|
||||
"start_offset": 45,
|
||||
"end_offset": 48,
|
||||
"type": "word",
|
||||
"position": 8
|
||||
},
|
||||
{
|
||||
"token": "s",
|
||||
"start_offset": 49,
|
||||
"end_offset": 50,
|
||||
"type": "word",
|
||||
"position": 9
|
||||
},
|
||||
{
|
||||
"token": "bone",
|
||||
"start_offset": 51,
|
||||
"end_offset": 55,
|
||||
"type": "word",
|
||||
"position": 10
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above sentence would produce the following terms:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ The, QUICK, Brown, Foxes, jumped, over, the, lazy, dog, s, bone ]
|
||||
---------------------------
|
||||
|
||||
[float]
|
||||
=== Configuration
|
||||
|
||||
The `letter` tokenizer is not configurable.
|
||||
|
|
|
@ -1,15 +1,128 @@
|
|||
[[analysis-lowercase-tokenizer]]
|
||||
=== Lowercase Tokenizer
|
||||
|
||||
A tokenizer of type `lowercase` that performs the function of
|
||||
<<analysis-letter-tokenizer,Letter
|
||||
Tokenizer>> and
|
||||
<<analysis-lowercase-tokenfilter,Lower
|
||||
Case Token Filter>> together. It divides text at non-letters and converts
|
||||
them to lower case. While it is functionally equivalent to the
|
||||
combination of
|
||||
<<analysis-letter-tokenizer,Letter
|
||||
Tokenizer>> and
|
||||
<<analysis-lowercase-tokenfilter,Lower
|
||||
Case Token Filter>>, there is a performance advantage to doing the two
|
||||
tasks at once, hence this (redundant) implementation.
|
||||
|
||||
The `lowercase` tokenizer, like the
|
||||
<<analysis-letter-tokenizer, `letter` tokenizer>>, breaks text into terms
|
||||
whenever it encounters a character which is not a letter, but it also
|
||||
lowercases all terms. It is functionally equivalent to the
|
||||
<<analysis-letter-tokenizer, `letter` tokenizer>> combined with the
|
||||
<<analysis-lowercase-tokenfilter, `lowercase` token filter>>, but is more
|
||||
efficient as it performs both steps in a single pass.
|
||||
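The equivalence can be checked with the `_analyze` API by chaining the two
components explicitly. This request is an illustrative sketch added here (not
part of the original change); it should return the same terms as the
`lowercase` tokenizer example below:

[source,js]
----------------------------
POST _analyze
{
  "tokenizer": "letter",
  "filter": [ "lowercase" ],
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
----------------------------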
|
||||
|
||||
[float]
|
||||
=== Example output
|
||||
|
||||
[source,js]
|
||||
---------------------------
|
||||
POST _analyze
|
||||
{
|
||||
"tokenizer": "lowercase",
|
||||
"text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
|
||||
}
|
||||
---------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "the",
|
||||
"start_offset": 0,
|
||||
"end_offset": 3,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "quick",
|
||||
"start_offset": 6,
|
||||
"end_offset": 11,
|
||||
"type": "word",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "brown",
|
||||
"start_offset": 12,
|
||||
"end_offset": 17,
|
||||
"type": "word",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "foxes",
|
||||
"start_offset": 18,
|
||||
"end_offset": 23,
|
||||
"type": "word",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "jumped",
|
||||
"start_offset": 24,
|
||||
"end_offset": 30,
|
||||
"type": "word",
|
||||
"position": 4
|
||||
},
|
||||
{
|
||||
"token": "over",
|
||||
"start_offset": 31,
|
||||
"end_offset": 35,
|
||||
"type": "word",
|
||||
"position": 5
|
||||
},
|
||||
{
|
||||
"token": "the",
|
||||
"start_offset": 36,
|
||||
"end_offset": 39,
|
||||
"type": "word",
|
||||
"position": 6
|
||||
},
|
||||
{
|
||||
"token": "lazy",
|
||||
"start_offset": 40,
|
||||
"end_offset": 44,
|
||||
"type": "word",
|
||||
"position": 7
|
||||
},
|
||||
{
|
||||
"token": "dog",
|
||||
"start_offset": 45,
|
||||
"end_offset": 48,
|
||||
"type": "word",
|
||||
"position": 8
|
||||
},
|
||||
{
|
||||
"token": "s",
|
||||
"start_offset": 49,
|
||||
"end_offset": 50,
|
||||
"type": "word",
|
||||
"position": 9
|
||||
},
|
||||
{
|
||||
"token": "bone",
|
||||
"start_offset": 51,
|
||||
"end_offset": 55,
|
||||
"type": "word",
|
||||
"position": 10
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above sentence would produce the following terms:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ the, quick, brown, foxes, jumped, over, the, lazy, dog, s, bone ]
|
||||
---------------------------
|
||||
|
||||
[float]
|
||||
=== Configuration
|
||||
|
||||
The `lowercase` tokenizer is not configurable.
|
||||
|
|
|
@ -1,57 +1,306 @@
|
|||
[[analysis-ngram-tokenizer]]
|
||||
=== NGram Tokenizer
|
||||
|
||||
A tokenizer of type `nGram`.
|
||||
The `ngram` tokenizer first breaks text down into words whenever it encounters
|
||||
one of a list of specified characters, then it emits
|
||||
https://en.wikipedia.org/wiki/N-gram[N-grams] of each word of the specified
|
||||
length.
|
||||
|
||||
The following are settings that can be set for a `nGram` tokenizer type:
|
||||
|
||||
[cols="<,<,<",options="header",]
|
||||
|=======================================================================
|
||||
|Setting |Description |Default value
|
||||
|`min_gram` |Minimum size in codepoints of a single n-gram |`1`.
|
||||
|
||||
|`max_gram` |Maximum size in codepoints of a single n-gram |`2`.
|
||||
|
||||
|`token_chars` |Character classes to keep in the
|
||||
tokens, Elasticsearch will split on characters that don't belong to any
|
||||
of these classes. |`[]` (Keep all characters)
|
||||
|=======================================================================
|
||||
|
||||
`token_chars` accepts the following character classes:
|
||||
|
||||
[horizontal]
|
||||
`letter`:: for example `a`, `b`, `ï` or `京`
|
||||
`digit`:: for example `3` or `7`
|
||||
`whitespace`:: for example `" "` or `"\n"`
|
||||
`punctuation`:: for example `!` or `"`
|
||||
`symbol`:: for example `$` or `√`
|
||||
N-grams are like a sliding window that moves across the word - a continuous
|
||||
sequence of characters of the specified length. They are useful for querying
|
||||
languages that don't use spaces or that have long compound words, like German.
|
||||
|
||||
[float]
|
||||
==== Example
|
||||
=== Example output
|
||||
|
||||
With the default settings, the `ngram` tokenizer treats the initial text as a
|
||||
single token and produces N-grams with minimum length `1` and maximum length
|
||||
`2`:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
curl -XPUT 'localhost:9200/test' -d '
|
||||
{
|
||||
"settings" : {
|
||||
"analysis" : {
|
||||
"analyzer" : {
|
||||
"my_ngram_analyzer" : {
|
||||
"tokenizer" : "my_ngram_tokenizer"
|
||||
}
|
||||
},
|
||||
"tokenizer" : {
|
||||
"my_ngram_tokenizer" : {
|
||||
"type" : "nGram",
|
||||
"min_gram" : "2",
|
||||
"max_gram" : "3",
|
||||
"token_chars": [ "letter", "digit" ]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}'
|
||||
---------------------------
|
||||
POST _analyze
|
||||
{
|
||||
"tokenizer": "ngram",
|
||||
"text": "Quick Fox"
|
||||
}
|
||||
---------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "Q",
|
||||
"start_offset": 0,
|
||||
"end_offset": 1,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "Qu",
|
||||
"start_offset": 0,
|
||||
"end_offset": 2,
|
||||
"type": "word",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "u",
|
||||
"start_offset": 1,
|
||||
"end_offset": 2,
|
||||
"type": "word",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "ui",
|
||||
"start_offset": 1,
|
||||
"end_offset": 3,
|
||||
"type": "word",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "i",
|
||||
"start_offset": 2,
|
||||
"end_offset": 3,
|
||||
"type": "word",
|
||||
"position": 4
|
||||
},
|
||||
{
|
||||
"token": "ic",
|
||||
"start_offset": 2,
|
||||
"end_offset": 4,
|
||||
"type": "word",
|
||||
"position": 5
|
||||
},
|
||||
{
|
||||
"token": "c",
|
||||
"start_offset": 3,
|
||||
"end_offset": 4,
|
||||
"type": "word",
|
||||
"position": 6
|
||||
},
|
||||
{
|
||||
"token": "ck",
|
||||
"start_offset": 3,
|
||||
"end_offset": 5,
|
||||
"type": "word",
|
||||
"position": 7
|
||||
},
|
||||
{
|
||||
"token": "k",
|
||||
"start_offset": 4,
|
||||
"end_offset": 5,
|
||||
"type": "word",
|
||||
"position": 8
|
||||
},
|
||||
{
|
||||
"token": "k ",
|
||||
"start_offset": 4,
|
||||
"end_offset": 6,
|
||||
"type": "word",
|
||||
"position": 9
|
||||
},
|
||||
{
|
||||
"token": " ",
|
||||
"start_offset": 5,
|
||||
"end_offset": 6,
|
||||
"type": "word",
|
||||
"position": 10
|
||||
},
|
||||
{
|
||||
"token": " F",
|
||||
"start_offset": 5,
|
||||
"end_offset": 7,
|
||||
"type": "word",
|
||||
"position": 11
|
||||
},
|
||||
{
|
||||
"token": "F",
|
||||
"start_offset": 6,
|
||||
"end_offset": 7,
|
||||
"type": "word",
|
||||
"position": 12
|
||||
},
|
||||
{
|
||||
"token": "Fo",
|
||||
"start_offset": 6,
|
||||
"end_offset": 8,
|
||||
"type": "word",
|
||||
"position": 13
|
||||
},
|
||||
{
|
||||
"token": "o",
|
||||
"start_offset": 7,
|
||||
"end_offset": 8,
|
||||
"type": "word",
|
||||
"position": 14
|
||||
},
|
||||
{
|
||||
"token": "ox",
|
||||
"start_offset": 7,
|
||||
"end_offset": 9,
|
||||
"type": "word",
|
||||
"position": 15
|
||||
},
|
||||
{
|
||||
"token": "x",
|
||||
"start_offset": 8,
|
||||
"end_offset": 9,
|
||||
"type": "word",
|
||||
"position": 16
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above sentence would produce the following terms:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ Q, Qu, u, ui, i, ic, c, ck, k, "k ", " ", " F", F, Fo, o, ox, x ]
|
||||
---------------------------
|
||||
|
||||
[float]
|
||||
=== Configuration
|
||||
|
||||
The `ngram` tokenizer accepts the following parameters:
|
||||
|
||||
[horizontal]
|
||||
`min_gram`::
|
||||
Minimum length of characters in a gram. Defaults to `1`.
|
||||
|
||||
`max_gram`::
|
||||
Maximum length of characters in a gram. Defaults to `2`.
|
||||
|
||||
`token_chars`::
|
||||
|
||||
Character classes that should be included in a token. Elasticsearch
|
||||
will split on characters that don't belong to the classes specified.
|
||||
Defaults to `[]` (keep all characters).
|
||||
+
|
||||
Character classes may be any of the following:
|
||||
+
|
||||
* `letter` -- for example `a`, `b`, `ï` or `京`
|
||||
* `digit` -- for example `3` or `7`
|
||||
* `whitespace` -- for example `" "` or `"\n"`
|
||||
* `punctuation` -- for example `!` or `"`
|
||||
* `symbol` -- for example `$` or `√`
|
||||
|
||||
TIP: It usually makes sense to set `min_gram` and `max_gram` to the same
|
||||
value. The smaller the length, the more documents will match but the lower
|
||||
the quality of the matches. The longer the length, the more specific the
|
||||
matches. A tri-gram (length `3`) is a good place to start.
|
||||
|
||||
[float]
|
||||
=== Example configuration
|
||||
|
||||
In this example, we configure the `ngram` tokenizer to treat letters and
|
||||
digits as tokens, and to produce tri-grams (grams of length `3`):
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
PUT my_index
|
||||
{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"analyzer": {
|
||||
"my_analyzer": {
|
||||
"tokenizer": "my_tokenizer"
|
||||
}
|
||||
},
|
||||
"tokenizer": {
|
||||
"my_tokenizer": {
|
||||
"type": "ngram",
|
||||
"min_gram": 3,
|
||||
"max_gram": 3,
|
||||
"token_chars": [
|
||||
"letter",
|
||||
"digit"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GET _cluster/health?wait_for_status=yellow
|
||||
|
||||
POST my_index/_analyze
|
||||
{
|
||||
"analyzer": "my_analyzer",
|
||||
"text": "2 Quick Foxes."
|
||||
}
|
||||
----------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "Qui",
|
||||
"start_offset": 2,
|
||||
"end_offset": 5,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "uic",
|
||||
"start_offset": 3,
|
||||
"end_offset": 6,
|
||||
"type": "word",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "ick",
|
||||
"start_offset": 4,
|
||||
"end_offset": 7,
|
||||
"type": "word",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "Fox",
|
||||
"start_offset": 8,
|
||||
"end_offset": 11,
|
||||
"type": "word",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "oxe",
|
||||
"start_offset": 9,
|
||||
"end_offset": 12,
|
||||
"type": "word",
|
||||
"position": 4
|
||||
},
|
||||
{
|
||||
"token": "xes",
|
||||
"start_offset": 10,
|
||||
"end_offset": 13,
|
||||
"type": "word",
|
||||
"position": 5
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above example produces the following terms:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ Qui, uic, ick, Fox, oxe, xes ]
|
||||
---------------------------
|
||||
|
||||
|
||||
curl 'localhost:9200/test/_analyze?pretty=1&analyzer=my_ngram_analyzer' -d 'FC Schalke 04'
|
||||
# FC, Sc, Sch, ch, cha, ha, hal, al, alk, lk, lke, ke, 04
|
||||
--------------------------------------------------
|
||||
|
|
|
@ -1,32 +1,175 @@
|
|||
[[analysis-pathhierarchy-tokenizer]]
|
||||
=== Path Hierarchy Tokenizer
|
||||
|
||||
The `path_hierarchy` tokenizer takes something like this:
|
||||
The `path_hierarchy` tokenizer takes a hierarchical value like a filesystem
|
||||
path, splits on the path separator, and emits a term for each component in the
|
||||
tree.
|
||||
|
||||
-------------------------
|
||||
/something/something/else
|
||||
-------------------------
|
||||
[float]
|
||||
=== Example output
|
||||
|
||||
And produces tokens:
|
||||
[source,js]
|
||||
---------------------------
|
||||
POST _analyze
|
||||
{
|
||||
"tokenizer": "path_hierarchy",
|
||||
"text": "/one/two/three"
|
||||
}
|
||||
---------------------------
|
||||
// CONSOLE
|
||||
|
||||
-------------------------
|
||||
/something
|
||||
/something/something
|
||||
/something/something/else
|
||||
-------------------------
|
||||
/////////////////////
|
||||
|
||||
[cols="<,<",options="header",]
|
||||
|=======================================================================
|
||||
|Setting |Description
|
||||
|`delimiter` |The character delimiter to use, defaults to `/`.
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "/one",
|
||||
"start_offset": 0,
|
||||
"end_offset": 4,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "/one/two",
|
||||
"start_offset": 0,
|
||||
"end_offset": 8,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "/one/two/three",
|
||||
"start_offset": 0,
|
||||
"end_offset": 14,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
|`replacement` |An optional replacement character to use. Defaults to
|
||||
the `delimiter`.
|
||||
/////////////////////
|
||||
|
||||
|`buffer_size` |The buffer size to use, defaults to `1024`.
|
||||
|
||||
|`reverse` |Generates tokens in reverse order, defaults to `false`.
|
||||
|
||||
|`skip` |Controls initial tokens to skip, defaults to `0`.
|
||||
|=======================================================================
|
||||
The above text would produce the following terms:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ /one, /one/two, /one/two/three ]
|
||||
---------------------------
|
||||
|
||||
[float]
|
||||
=== Configuration
|
||||
|
||||
The `path_hierarchy` tokenizer accepts the following parameters:
|
||||
|
||||
[horizontal]
|
||||
`delimiter`::
|
||||
The character to use as the path separator. Defaults to `/`.
|
||||
|
||||
`replacement`::
|
||||
An optional replacement character to use for the delimiter.
|
||||
Defaults to the `delimiter`.
|
||||
|
||||
`buffer_size`::
|
||||
The number of characters read into the term buffer in a single pass.
|
||||
Defaults to `1024`. The term buffer will grow by this size until all the
|
||||
text has been consumed. It is advisable not to change this setting.
|
||||
|
||||
`reverse`::
|
||||
If set to `true`, emits the tokens in reverse order. Defaults to `false`.
|
||||
|
||||
`skip`::
|
||||
The number of initial tokens to skip. Defaults to `0`.
|
||||
|
||||
[float]
|
||||
=== Example configuration
|
||||
|
||||
In this example, we configure the `path_hierarchy` tokenizer to split on `-`
|
||||
characters, and to replace them with `/`. The first two tokens are skipped:
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
PUT my_index
|
||||
{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"analyzer": {
|
||||
"my_analyzer": {
|
||||
"tokenizer": "my_tokenizer"
|
||||
}
|
||||
},
|
||||
"tokenizer": {
|
||||
"my_tokenizer": {
|
||||
"type": "path_hierarchy",
|
||||
"delimiter": "-",
|
||||
"replacement": "/",
|
||||
"skip": 2
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GET _cluster/health?wait_for_status=yellow
|
||||
|
||||
POST my_index/_analyze
|
||||
{
|
||||
"analyzer": "my_analyzer",
|
||||
"text": "one-two-three-four-five"
|
||||
}
|
||||
----------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "/three",
|
||||
"start_offset": 7,
|
||||
"end_offset": 13,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "/three/four",
|
||||
"start_offset": 7,
|
||||
"end_offset": 18,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "/three/four/five",
|
||||
"start_offset": 7,
|
||||
"end_offset": 23,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above example produces the following terms:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ /three, /three/four, /three/four/five ]
|
||||
---------------------------
|
||||
|
||||
If we were to set `reverse` to `true`, it would produce the following:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ one/two/three/, two/three/, three/ ]
|
||||
---------------------------
|
||||
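A reverse variant could be configured in the same way as the example above by
adding `"reverse": true` to the tokenizer definition. The index name
`my_index_reverse` below is hypothetical and the snippet is an untested sketch,
not part of the original change:

[source,js]
----------------------------
PUT my_index_reverse
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "path_hierarchy",
          "delimiter": "-",
          "replacement": "/",
          "skip": 2,
          "reverse": true
        }
      }
    }
  }
}

POST my_index_reverse/_analyze
{
  "analyzer": "my_analyzer",
  "text": "one-two-three-four-five"
}
----------------------------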
|
||||
|
|
|
@ -1,38 +1,268 @@
|
|||
[[analysis-pattern-tokenizer]]
|
||||
=== Pattern Tokenizer
|
||||
|
||||
A tokenizer of type `pattern` that can flexibly separate text into terms
|
||||
via a regular expression. Accepts the following settings:
|
||||
The `pattern` tokenizer uses a regular expression to either split text into
|
||||
terms whenever it matches a word separator, or to capture matching text as
|
||||
terms.
|
||||
|
||||
[cols="<,<",options="header",]
|
||||
|======================================================================
|
||||
|Setting |Description
|
||||
|`pattern` |The regular expression pattern, defaults to `\W+`.
|
||||
|`flags` |The regular expression flags.
|
||||
|`group` |Which group to extract into tokens. Defaults to `-1` (split).
|
||||
|======================================================================
|
||||
The default pattern is `\W+`, which splits text whenever it encounters
|
||||
non-word characters.
|
||||
|
||||
*IMPORTANT*: The regular expression should match the *token separators*,
|
||||
not the tokens themselves.
|
||||
[float]
|
||||
=== Example output
|
||||
|
||||
*********************************************
|
||||
Note that you may need to escape `pattern` string literal according to
|
||||
your client language rules. For example, in many programming languages
|
||||
a string literal for `\W+` pattern is written as `"\\W+"`.
|
||||
There is nothing special about `pattern` (you may have to escape other
|
||||
string literals as well); escaping `pattern` is common just because it
|
||||
often contains characters that should be escaped.
|
||||
*********************************************
|
||||
[source,js]
|
||||
---------------------------
|
||||
POST _analyze
|
||||
{
|
||||
"tokenizer": "pattern",
|
||||
"text": "The foo_bar_size's default is 5."
|
||||
}
|
||||
---------------------------
|
||||
// CONSOLE
|
||||
|
||||
`group` set to `-1` (the default) is equivalent to "split". Using group
|
||||
>= 0 selects the matching group as the token. For example, if you have:
|
||||
/////////////////////
|
||||
|
||||
------------------------
|
||||
pattern = '([^']+)'
|
||||
group = 0
|
||||
input = aaa 'bbb' 'ccc'
|
||||
------------------------
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "The",
|
||||
"start_offset": 0,
|
||||
"end_offset": 3,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "foo_bar_size",
|
||||
"start_offset": 4,
|
||||
"end_offset": 16,
|
||||
"type": "word",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "s",
|
||||
"start_offset": 17,
|
||||
"end_offset": 18,
|
||||
"type": "word",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "default",
|
||||
"start_offset": 19,
|
||||
"end_offset": 26,
|
||||
"type": "word",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "is",
|
||||
"start_offset": 27,
|
||||
"end_offset": 29,
|
||||
"type": "word",
|
||||
"position": 4
|
||||
},
|
||||
{
|
||||
"token": "5",
|
||||
"start_offset": 30,
|
||||
"end_offset": 31,
|
||||
"type": "word",
|
||||
"position": 5
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
the output will be two tokens: `'bbb'` and `'ccc'` (including the `'`
|
||||
marks). With the same input but using group=1, the output would be:
|
||||
`bbb` and `ccc` (no `'` marks).
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above sentence would produce the following terms:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ The, foo_bar_size, s, default, is, 5 ]
|
||||
---------------------------
|
||||
|
||||
[float]
|
||||
=== Configuration
|
||||
|
||||
The `pattern` tokenizer accepts the following parameters:
|
||||
|
||||
[horizontal]
|
||||
`pattern`::
|
||||
|
||||
A http://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html[Java regular expression], defaults to `\W+`.
|
||||
|
||||
`flags`::
|
||||
|
||||
Java regular expression http://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#field.summary[flags].
|
||||
Flags should be pipe-separated, eg `"CASE_INSENSITIVE|COMMENTS"`; see the sketch after this parameter list for an example.
|
||||
|
||||
`group`::
|
||||
|
||||
Which capture group to extract as tokens. Defaults to `-1` (split).
|
||||
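As an illustration of the `flags` parameter, the sketch below (not part of the
original change; the index name `my_flags_index` is hypothetical) splits text
on the word "and" regardless of case by combining the pattern `\s+and\s+` with
the `CASE_INSENSITIVE` flag:

[source,js]
----------------------------
PUT my_flags_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "pattern",
          "pattern": "\\s+and\\s+",
          "flags": "CASE_INSENSITIVE"
        }
      }
    }
  }
}

POST my_flags_index/_analyze
{
  "analyzer": "my_analyzer",
  "text": "apples AND oranges and pears"
}
----------------------------

This should produce the terms `[ apples, oranges, pears ]`.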
|
||||
[float]
|
||||
=== Example configuration
|
||||
|
||||
In this example, we configure the `pattern` tokenizer to break text into
|
||||
tokens when it encounters commas:
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
PUT my_index
|
||||
{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"analyzer": {
|
||||
"my_analyzer": {
|
||||
"tokenizer": "my_tokenizer"
|
||||
}
|
||||
},
|
||||
"tokenizer": {
|
||||
"my_tokenizer": {
|
||||
"type": "pattern",
|
||||
"pattern": ","
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GET _cluster/health?wait_for_status=yellow
|
||||
|
||||
POST my_index/_analyze
|
||||
{
|
||||
"analyzer": "my_analyzer",
|
||||
"text": "comma,separated,values"
|
||||
}
|
||||
----------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "comma",
|
||||
"start_offset": 0,
|
||||
"end_offset": 5,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "separated",
|
||||
"start_offset": 6,
|
||||
"end_offset": 15,
|
||||
"type": "word",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "values",
|
||||
"start_offset": 16,
|
||||
"end_offset": 22,
|
||||
"type": "word",
|
||||
"position": 2
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above example produces the following terms:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ comma, separated, values ]
|
||||
---------------------------
|
||||
|
||||
In the next example, we configure the `pattern` tokenizer to capture values
|
||||
enclosed in double quotes (ignoring embedded escaped quotes `\"`). The regex
|
||||
itself looks like this:
|
||||
|
||||
"((?:\\"|[^"]|\\")*)"
|
||||
|
||||
And reads as follows:
|
||||
|
||||
* A literal `"`
|
||||
* Start capturing:
|
||||
** A literal `\"` OR any character except `"`
|
||||
** Repeat until no more characters match
|
||||
* A literal closing `"`
|
||||
|
||||
When the pattern is specified in JSON, the `"` and `\` characters need to be
|
||||
escaped, so the pattern ends up looking like:
|
||||
|
||||
\"((?:\\\\\"|[^\"]|\\\\\")+)\"
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
PUT my_index
|
||||
{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"analyzer": {
|
||||
"my_analyzer": {
|
||||
"tokenizer": "my_tokenizer"
|
||||
}
|
||||
},
|
||||
"tokenizer": {
|
||||
"my_tokenizer": {
|
||||
"type": "pattern",
|
||||
"pattern": "\"((?:\\\\\"|[^\"]|\\\\\")+)\"",
|
||||
"group": 1
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GET _cluster/health?wait_for_status=yellow
|
||||
|
||||
POST my_index/_analyze
|
||||
{
|
||||
"analyzer": "my_analyzer",
|
||||
"text": "\"value\", \"value with embedded \\\" quote\""
|
||||
}
|
||||
----------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "value",
|
||||
"start_offset": 1,
|
||||
"end_offset": 6,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "value with embedded \\\" quote",
|
||||
"start_offset": 10,
|
||||
"end_offset": 38,
|
||||
"type": "word",
|
||||
"position": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
The above example produces the following two terms:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ value, value with embedded \" quote ]
|
||||
---------------------------
|
||||
|
|
|
@ -1,18 +1,274 @@
|
|||
[[analysis-standard-tokenizer]]
|
||||
=== Standard Tokenizer
|
||||
|
||||
A tokenizer of type `standard` providing grammar based tokenizer that is
|
||||
a good tokenizer for most European language documents. The tokenizer
|
||||
implements the Unicode Text Segmentation algorithm, as specified in
|
||||
http://unicode.org/reports/tr29/[Unicode Standard Annex #29].
|
||||
The `standard` tokenizer provides grammar based tokenization (based on the
|
||||
Unicode Text Segmentation algorithm, as specified in
|
||||
http://unicode.org/reports/tr29/[Unicode Standard Annex #29]) and works well
|
||||
for most languages.
|
||||
|
||||
[float]
|
||||
=== Example output
|
||||
|
||||
[source,js]
|
||||
---------------------------
|
||||
POST _analyze
|
||||
{
|
||||
"tokenizer": "standard",
|
||||
"text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
|
||||
}
|
||||
---------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "The",
|
||||
"start_offset": 0,
|
||||
"end_offset": 3,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "2",
|
||||
"start_offset": 4,
|
||||
"end_offset": 5,
|
||||
"type": "<NUM>",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "QUICK",
|
||||
"start_offset": 6,
|
||||
"end_offset": 11,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "Brown",
|
||||
"start_offset": 12,
|
||||
"end_offset": 17,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "Foxes",
|
||||
"start_offset": 18,
|
||||
"end_offset": 23,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 4
|
||||
},
|
||||
{
|
||||
"token": "jumped",
|
||||
"start_offset": 24,
|
||||
"end_offset": 30,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 5
|
||||
},
|
||||
{
|
||||
"token": "over",
|
||||
"start_offset": 31,
|
||||
"end_offset": 35,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 6
|
||||
},
|
||||
{
|
||||
"token": "the",
|
||||
"start_offset": 36,
|
||||
"end_offset": 39,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 7
|
||||
},
|
||||
{
|
||||
"token": "lazy",
|
||||
"start_offset": 40,
|
||||
"end_offset": 44,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 8
|
||||
},
|
||||
{
|
||||
"token": "dog's",
|
||||
"start_offset": 45,
|
||||
"end_offset": 50,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 9
|
||||
},
|
||||
{
|
||||
"token": "bone",
|
||||
"start_offset": 51,
|
||||
"end_offset": 55,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 10
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above sentence would produce the following terms:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
|
||||
---------------------------
|
||||
|
||||
[float]
|
||||
=== Configuration
|
||||
|
||||
The `standard` tokenizer accepts the following parameters:
|
||||
|
||||
[horizontal]
|
||||
`max_token_length`::
|
||||
|
||||
The maximum token length. If a token is seen that exceeds this length then
|
||||
it is split at `max_token_length` intervals. Defaults to `255`.
|
||||
|
||||
[float]
|
||||
=== Example configuration
|
||||
|
||||
In this example, we configure the `standard` tokenizer to have a
|
||||
`max_token_length` of 5 (for demonstration purposes):
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
PUT my_index
|
||||
{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"analyzer": {
|
||||
"my_analyzer": {
|
||||
"tokenizer": "my_tokenizer"
|
||||
}
|
||||
},
|
||||
"tokenizer": {
|
||||
"my_tokenizer": {
|
||||
"type": "standard",
|
||||
"max_token_length": 5
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GET _cluster/health?wait_for_status=yellow
|
||||
|
||||
POST my_index/_analyze
|
||||
{
|
||||
"analyzer": "my_analyzer",
|
||||
"text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
|
||||
}
|
||||
----------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "The",
|
||||
"start_offset": 0,
|
||||
"end_offset": 3,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "2",
|
||||
"start_offset": 4,
|
||||
"end_offset": 5,
|
||||
"type": "<NUM>",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "QUICK",
|
||||
"start_offset": 6,
|
||||
"end_offset": 11,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "Brown",
|
||||
"start_offset": 12,
|
||||
"end_offset": 17,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "Foxes",
|
||||
"start_offset": 18,
|
||||
"end_offset": 23,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 4
|
||||
},
|
||||
{
|
||||
"token": "jumpe",
|
||||
"start_offset": 24,
|
||||
"end_offset": 29,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 5
|
||||
},
|
||||
{
|
||||
"token": "d",
|
||||
"start_offset": 29,
|
||||
"end_offset": 30,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 6
|
||||
},
|
||||
{
|
||||
"token": "over",
|
||||
"start_offset": 31,
|
||||
"end_offset": 35,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 7
|
||||
},
|
||||
{
|
||||
"token": "the",
|
||||
"start_offset": 36,
|
||||
"end_offset": 39,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 8
|
||||
},
|
||||
{
|
||||
"token": "lazy",
|
||||
"start_offset": 40,
|
||||
"end_offset": 44,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 9
|
||||
},
|
||||
{
|
||||
"token": "dog's",
|
||||
"start_offset": 45,
|
||||
"end_offset": 50,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 10
|
||||
},
|
||||
{
|
||||
"token": "bone",
|
||||
"start_offset": 51,
|
||||
"end_offset": 55,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 11
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above example produces the following terms:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ The, 2, QUICK, Brown, Foxes, jumpe, d, over, the, lazy, dog's, bone ]
|
||||
---------------------------
|
||||
|
||||
The following are settings that can be set for a `standard` tokenizer
|
||||
type:
|
||||
|
||||
[cols="<,<",options="header",]
|
||||
|=======================================================================
|
||||
|Setting |Description
|
||||
|`max_token_length` |The maximum token length. If a token is seen that
|
||||
exceeds this length then it is split at `max_token_length` intervals. Defaults to `255`.
|
||||
|=======================================================================
|
||||
|
||||
|
|
|
@ -1,7 +1,106 @@
|
|||
[[analysis-thai-tokenizer]]
|
||||
=== Thai Tokenizer
|
||||
|
||||
A tokenizer of type `thai` that segments Thai text into words. This tokenizer
|
||||
uses the built-in Thai segmentation algorithm included with Java to divide
|
||||
up Thai text. Text in other languages in general will be treated the same
|
||||
as `standard`.
|
||||
The `thai` tokenizer segments Thai text into words, using the Thai
|
||||
segmentation algorithm included with Java. Text in other languages in general
|
||||
will be treated the same as the
|
||||
<<analysis-standard-tokenizer,`standard` tokenizer>>.
|
||||
|
||||
WARNING: This tokenizer may not be supported by all JREs. It is known to work
|
||||
with Sun/Oracle and OpenJDK. If your application needs to be fully portable,
|
||||
consider using the {plugins}/analysis-icu-tokenizer.html[ICU Tokenizer] instead.
|
||||
|
||||
[float]
|
||||
=== Example output
|
||||
|
||||
[source,js]
|
||||
---------------------------
|
||||
POST _analyze
|
||||
{
|
||||
"tokenizer": "thai",
|
||||
"text": "การที่ได้ต้องแสดงว่างานดี"
|
||||
}
|
||||
---------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "การ",
|
||||
"start_offset": 0,
|
||||
"end_offset": 3,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "ที่",
|
||||
"start_offset": 3,
|
||||
"end_offset": 6,
|
||||
"type": "word",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "ได้",
|
||||
"start_offset": 6,
|
||||
"end_offset": 9,
|
||||
"type": "word",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "ต้อง",
|
||||
"start_offset": 9,
|
||||
"end_offset": 13,
|
||||
"type": "word",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "แสดง",
|
||||
"start_offset": 13,
|
||||
"end_offset": 17,
|
||||
"type": "word",
|
||||
"position": 4
|
||||
},
|
||||
{
|
||||
"token": "ว่า",
|
||||
"start_offset": 17,
|
||||
"end_offset": 20,
|
||||
"type": "word",
|
||||
"position": 5
|
||||
},
|
||||
{
|
||||
"token": "งาน",
|
||||
"start_offset": 20,
|
||||
"end_offset": 23,
|
||||
"type": "word",
|
||||
"position": 6
|
||||
},
|
||||
{
|
||||
"token": "ดี",
|
||||
"start_offset": 23,
|
||||
"end_offset": 25,
|
||||
"type": "word",
|
||||
"position": 7
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above sentence would produce the following terms:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ การ, ที่, ได้, ต้อง, แสดง, ว่า, งาน, ดี ]
|
||||
---------------------------
|
||||
|
||||
[float]
|
||||
=== Configuration
|
||||
|
||||
The `thai` tokenizer is not configurable.
|
||||
|
|
|
@ -1,16 +1,199 @@
|
|||
[[analysis-uaxurlemail-tokenizer]]
|
||||
=== UAX Email URL Tokenizer
|
||||
=== UAX URL Email Tokenizer
|
||||
|
||||
A tokenizer of type `uax_url_email` which works exactly like the
|
||||
`standard` tokenizer, but tokenizes emails and urls as single tokens.
|
||||
The `uax_url_email` tokenizer is like the <<analysis-standard-tokenizer,`standard` tokenizer>> except that it
|
||||
recognises URLs and email addresses as single tokens.
|
||||
|
||||
The following are settings that can be set for a `uax_url_email`
|
||||
tokenizer type:
|
||||
[float]
|
||||
=== Example output
|
||||
|
||||
[cols="<,<",options="header",]
|
||||
|=======================================================================
|
||||
|Setting |Description
|
||||
|`max_token_length` |The maximum token length. If a token is seen that
|
||||
exceeds this length then it is discarded. Defaults to `255`.
|
||||
|=======================================================================
|
||||
[source,js]
|
||||
---------------------------
|
||||
POST _analyze
|
||||
{
|
||||
"tokenizer": "uax_url_email",
|
||||
"text": "Email me at john.smith@global-international.com"
|
||||
}
|
||||
---------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "Email",
|
||||
"start_offset": 0,
|
||||
"end_offset": 5,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "me",
|
||||
"start_offset": 6,
|
||||
"end_offset": 8,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "at",
|
||||
"start_offset": 9,
|
||||
"end_offset": 11,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "john.smith@global-international.com",
|
||||
"start_offset": 12,
|
||||
"end_offset": 47,
|
||||
"type": "<EMAIL>",
|
||||
"position": 3
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above sentence would produce the following terms:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ Email, me, at, john.smith@global-international.com ]
|
||||
---------------------------
|
||||
|
||||
while the `standard` tokenizer would produce:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ Email, me, at, john.smith, global, international.com ]
|
||||
---------------------------
|
||||
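The comparison can be reproduced by running the same sentence through the
`standard` tokenizer directly. This request is an illustrative addition, not
part of the original change:

[source,js]
----------------------------
POST _analyze
{
  "tokenizer": "standard",
  "text": "Email me at john.smith@global-international.com"
}
----------------------------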
|
||||
[float]
|
||||
=== Configuration
|
||||
|
||||
The `uax_url_email` tokenizer accepts the following parameters:
|
||||
|
||||
[horizontal]
|
||||
`max_token_length`::
|
||||
|
||||
The maximum token length. If a token is seen that exceeds this length then
|
||||
it is split at `max_token_length` intervals. Defaults to `255`.
|
||||
|
||||
[float]
|
||||
=== Example configuration
|
||||
|
||||
In this example, we configure the `uax_url_email` tokenizer to have a
|
||||
`max_token_length` of 5 (for demonstration purposes):
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
PUT my_index
|
||||
{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"analyzer": {
|
||||
"my_analyzer": {
|
||||
"tokenizer": "my_tokenizer"
|
||||
}
|
||||
},
|
||||
"tokenizer": {
|
||||
"my_tokenizer": {
|
||||
"type": "uax_url_email",
|
||||
"max_token_length": 5
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GET _cluster/health?wait_for_status=yellow
|
||||
|
||||
POST my_index/_analyze
|
||||
{
|
||||
"analyzer": "my_analyzer",
|
||||
"text": "john.smith@global-international.com"
|
||||
}
|
||||
----------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "john",
|
||||
"start_offset": 0,
|
||||
"end_offset": 4,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "smith",
|
||||
"start_offset": 5,
|
||||
"end_offset": 10,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "globa",
|
||||
"start_offset": 11,
|
||||
"end_offset": 16,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "l",
|
||||
"start_offset": 16,
|
||||
"end_offset": 17,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "inter",
|
||||
"start_offset": 18,
|
||||
"end_offset": 23,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 4
|
||||
},
|
||||
{
|
||||
"token": "natio",
|
||||
"start_offset": 23,
|
||||
"end_offset": 28,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 5
|
||||
},
|
||||
{
|
||||
"token": "nal.c",
|
||||
"start_offset": 28,
|
||||
"end_offset": 33,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 6
|
||||
},
|
||||
{
|
||||
"token": "om",
|
||||
"start_offset": 33,
|
||||
"end_offset": 35,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 7
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above example produces the following terms:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ john, smith, globa, l, inter, natio, nal.c, om ]
|
||||
---------------------------
|
||||
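Both examples in this section use an email address; a URL is handled in the
same way and is kept as a single token. The request below is an illustrative
sketch, not part of the original change:

[source,js]
----------------------------
POST _analyze
{
  "tokenizer": "uax_url_email",
  "text": "Read the docs at https://www.elastic.co/guide"
}
----------------------------

The URL `https://www.elastic.co/guide` should come back as one term rather
than being split into its components.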
|
|
|
@ -1,4 +1,114 @@
|
|||
[[analysis-whitespace-tokenizer]]
|
||||
=== Whitespace Tokenizer
|
||||
=== Whitespace Tokenizer
|
||||
|
||||
A tokenizer of type `whitespace` that divides text at whitespace.
|
||||
The `whitespace` tokenizer breaks text into terms whenever it encounters a
|
||||
whitespace character.
|
||||
|
||||
[float]
|
||||
=== Example output
|
||||
|
||||
[source,js]
|
||||
---------------------------
|
||||
POST _analyze
|
||||
{
|
||||
"tokenizer": "whitespace",
|
||||
"text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
|
||||
}
|
||||
---------------------------
|
||||
// CONSOLE
|
||||
|
||||
/////////////////////
|
||||
|
||||
[source,js]
|
||||
----------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "The",
|
||||
"start_offset": 0,
|
||||
"end_offset": 3,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "2",
|
||||
"start_offset": 4,
|
||||
"end_offset": 5,
|
||||
"type": "word",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "QUICK",
|
||||
"start_offset": 6,
|
||||
"end_offset": 11,
|
||||
"type": "word",
|
||||
"position": 2
|
||||
},
|
||||
{
|
||||
"token": "Brown-Foxes",
|
||||
"start_offset": 12,
|
||||
"end_offset": 23,
|
||||
"type": "word",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "jumped",
|
||||
"start_offset": 24,
|
||||
"end_offset": 30,
|
||||
"type": "word",
|
||||
"position": 4
|
||||
},
|
||||
{
|
||||
"token": "over",
|
||||
"start_offset": 31,
|
||||
"end_offset": 35,
|
||||
"type": "word",
|
||||
"position": 5
|
||||
},
|
||||
{
|
||||
"token": "the",
|
||||
"start_offset": 36,
|
||||
"end_offset": 39,
|
||||
"type": "word",
|
||||
"position": 6
|
||||
},
|
||||
{
|
||||
"token": "lazy",
|
||||
"start_offset": 40,
|
||||
"end_offset": 44,
|
||||
"type": "word",
|
||||
"position": 7
|
||||
},
|
||||
{
|
||||
"token": "dog's",
|
||||
"start_offset": 45,
|
||||
"end_offset": 50,
|
||||
"type": "word",
|
||||
"position": 8
|
||||
},
|
||||
{
|
||||
"token": "bone.",
|
||||
"start_offset": 51,
|
||||
"end_offset": 56,
|
||||
"type": "word",
|
||||
"position": 9
|
||||
}
|
||||
]
|
||||
}
|
||||
----------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
/////////////////////
|
||||
|
||||
|
||||
The above sentence would produce the following terms:
|
||||
|
||||
[source,text]
|
||||
---------------------------
|
||||
[ The, 2, QUICK, Brown-Foxes, jumped, over, the, lazy, dog's, bone. ]
|
||||
---------------------------
|
||||
|
||||
[float]
|
||||
=== Configuration
|
||||
|
||||
The `whitespace` tokenizer is not configurable.
|
||||
|
|