From 059577863e189cb366b66a31d8d4a7c3954dad08 Mon Sep 17 00:00:00 2001 From: Clinton Gormley Date: Sat, 5 Oct 2013 16:29:40 +0200 Subject: [PATCH] Update README.md --- README.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/README.md b/README.md index d0b764f4d6a..c1abaf1c6ad 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,32 @@ Folding of unicode characters based on `UTR#30`. It registers itself under `icu_ } } +ICU Filtering +------------- + +The folding can be filtered by a set of unicode characters with the parameter `unicodeSetFilter`. This is useful for a non-internationalized search engine that needs to retain a set of national characters which are primary letters in a specific language. See the UnicodeSet syntax [here](http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html). + +The following example exempts Swedish characters from the folding. Note that the filtered characters are NOT lowercased, which is why we add the `lowercase` filter below. + + { + "index" : { + "analysis" : { + "analyzer" : { + "folding" : { + "tokenizer" : "standard", + "filter" : ["my_icu_folding", "lowercase"] + } + }, + "filter" : { + "my_icu_folding" : { + "type" : "icu_folding", + "unicodeSetFilter" : "[^åäöÅÄÖ]" + } + } + } + } + } + ICU Collation -------------