From b23aab5482f109d6c70470e1902d9e61474aeb1c Mon Sep 17 00:00:00 2001
From: Steve Rowe
Date: Fri, 26 May 2017 14:47:24 -0400
Subject: [PATCH] SOLR-10758: Modernize the Solr ref guide's Chinese language
 analysis coverage

---
 .../icu/segmentation/TestICUTokenizerCJK.java |   9 +-
 solr/CHANGES.txt                              |   4 +
 .../solr-ref-guide/src/language-analysis.adoc | 140 ++++++++++++------
 solr/solr-ref-guide/src/tokenizers.adoc       |   2 +-
 4 files changed, 111 insertions(+), 44 deletions(-)

diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
index 96f44d686b0..75481f1924c 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
@@ -53,7 +53,14 @@ public class TestICUTokenizerCJK extends BaseTokenStreamTestCase {
         new String[] { "我", "购买", "了", "道具", "和", "服装" }
     );
   }
-  
+
+  public void testTraditionalChinese() throws Exception {
+    assertAnalyzesTo(a, "我購買了道具和服裝。",
+        new String[] { "我", "購買", "了", "道具", "和", "服裝"});
+    assertAnalyzesTo(a, "定義切分字串的基本單位是訂定分詞標準的首要工作", // From http://godel.iis.sinica.edu.tw/CKIP/paper/wordsegment_standard.pdf
+        new String[] { "定義", "切", "分", "字串", "的", "基本", "單位", "是", "訂定", "分詞", "標準", "的", "首要", "工作" });
+  }
+
   public void testChineseNumerics() throws Exception {
     assertAnalyzesTo(a, "9483", new String[] { "9483" });
     assertAnalyzesTo(a, "院內分機9483。",
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index d2f42e65c15..cd383d10bf3 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -247,6 +247,10 @@ Optimizations
   so that the second phase which would normally involve calculating the domain for the bucket
   can be skipped entirely, leading to large performance improvements. (yonik)
 
+Ref Guide
+----------------------
+
+* SOLR-10758: Modernize the Solr ref guide's Chinese language analysis coverage. (Steve Rowe)
 
 Other Changes
 ----------------------
diff --git a/solr/solr-ref-guide/src/language-analysis.adoc b/solr/solr-ref-guide/src/language-analysis.adoc
index 0cf8e13ac06..c55a0cdfadb 100644
--- a/solr/solr-ref-guide/src/language-analysis.adoc
+++ b/solr/solr-ref-guide/src/language-analysis.adoc
@@ -378,9 +378,8 @@ These factories are each designed to work with specific languages. The languages
 * <>
 * <>
 * <>
-* <>
+* <>
 * <>
-* <>
 * <>
 * <>
@@ -508,15 +507,100 @@ Solr can stem Catalan using the Snowball Porter Stemmer with an argument of `lan
 
 *Out:* "llengu"(1), "llengu"(2)
 
-[[LanguageAnalysis-Chinese]]
-=== Chinese
+[[LanguageAnalysis-TraditionalChinese]]
+=== Traditional Chinese
 
-<<tokenizers.adoc#Tokenizers-StandardTokenizer,Standard Tokenizer>> is suitable for Traditional Chinese text. Following the Word Break rules from the Unicode Text Segmentation algorithm, it produces one token per Chinese character.
+The default configuration of the <<tokenizers.adoc#Tokenizers-ICUTokenizer,ICU Tokenizer>> is suitable for Traditional Chinese text. It follows the Word Break rules from the Unicode Text Segmentation algorithm for non-Chinese text, and uses a dictionary to segment Chinese words. To use this tokenizer, see `solr/contrib/analysis-extras/README.txt` for instructions on which jars you need to add to your `SOLR_HOME/lib`.
+
+<<tokenizers.adoc#Tokenizers-StandardTokenizer,Standard Tokenizer>> can also be used to tokenize Traditional Chinese text. Following the Word Break rules from the Unicode Text Segmentation algorithm, it produces one token per Chinese character. When combined with <<LanguageAnalysis-CJKBigramFilter,CJK Bigram Filter>>, overlapping bigrams of Chinese characters are formed.
+
+<<filter-descriptions.adoc#FilterDescriptions-CJKWidthFilter,CJK Width Filter>> folds fullwidth ASCII variants into the equivalent Basic Latin forms.
+
+*Examples:*
+
+[source,xml]
+----
+<analyzer>
+  <tokenizer class="solr.ICUTokenizerFactory"/>
+  <filter class="solr.CJKWidthFilterFactory"/>
+  <filter class="solr.LowerCaseFilterFactory"/>
+</analyzer>
+----
+
+[source,xml]
+----
+<analyzer>
+  <tokenizer class="solr.StandardTokenizerFactory"/>
+  <filter class="solr.CJKWidthFilterFactory"/>
+  <filter class="solr.LowerCaseFilterFactory"/>
+  <filter class="solr.CJKBigramFilterFactory"/>
+</analyzer>
+----
+
+[[LanguageAnalysis-CJKBigramFilter]]
+=== CJK Bigram Filter
+
+Forms bigrams (overlapping 2-character sequences) of CJK characters that are generated from <<tokenizers.adoc#Tokenizers-StandardTokenizer,Standard Tokenizer>> or <<tokenizers.adoc#Tokenizers-ICUTokenizer,ICU Tokenizer>>.
+
+By default, all CJK characters produce bigrams, but finer grained control is available by specifying orthographic type arguments `han`, `hiragana`, `katakana`, and `hangul`. When set to `false`, characters of the corresponding type will be passed through as unigrams, and will not be included in any bigrams.
+
+When a CJK character has no adjacent characters to form a bigram, it is output in unigram form. If you want to always output both unigrams and bigrams, set the `outputUnigrams` argument to `true`.
+
+In all cases, all non-CJK input is passed through unmodified.
+
+*Arguments:*
+
+`han`:: (true/false) If false, Han (Chinese) characters will not form bigrams. Default is true.
+
+`hiragana`:: (true/false) If false, Hiragana (Japanese) characters will not form bigrams. Default is true.
+
+`katakana`:: (true/false) If false, Katakana (Japanese) characters will not form bigrams. Default is true.
+
+`hangul`:: (true/false) If false, Hangul (Korean) characters will not form bigrams. Default is true.
+
+`outputUnigrams`:: (true/false) If true, in addition to forming bigrams, all characters are also passed through as unigrams. Default is false.
+
+See the example under <<LanguageAnalysis-TraditionalChinese,Traditional Chinese>>.
 
 [[LanguageAnalysis-SimplifiedChinese]]
 === Simplified Chinese
 
-For Simplified Chinese, Solr provides support for Chinese sentence and word segmentation with the `solr.HMMChineseTokenizerFactory` in the `analysis-extras` contrib module. This component includes a large dictionary and segments Chinese text into words with the Hidden Markov Model. To use this filter, see `solr/contrib/analysis-extras/README.txt` for instructions on which jars you need to add to your `solr_home/lib`.
+For Simplified Chinese, Solr provides support for Chinese sentence and word segmentation with the <<LanguageAnalysis-HMMChineseTokenizer,HMM Chinese Tokenizer>>. This component includes a large dictionary and segments Chinese text into words with the Hidden Markov Model. To use this tokenizer, you must add additional .jars to Solr's classpath (as described in the section <>). See the `solr/contrib/analysis-extras/README.txt` for information on which jars you need to add to your `SOLR_HOME/lib`.
+
+The default configuration of the <<tokenizers.adoc#Tokenizers-ICUTokenizer,ICU Tokenizer>> is also suitable for Simplified Chinese text. It follows the Word Break rules from the Unicode Text Segmentation algorithm for non-Chinese text, and uses a dictionary to segment Chinese words. To use this tokenizer, you must add additional .jars to Solr's classpath (as described in the section <>). See the `solr/contrib/analysis-extras/README.txt` for information on which jars you need to add to your `SOLR_HOME/lib`.
+
+Also useful for Chinese analysis:
+
+<<filter-descriptions.adoc#FilterDescriptions-CJKWidthFilter,CJK Width Filter>> folds fullwidth ASCII variants into the equivalent Basic Latin forms, and folds halfwidth Katakana variants into their equivalent fullwidth forms.
+
+*Examples:*
+
+[source,xml]
+----
+<analyzer>
+  <tokenizer class="solr.HMMChineseTokenizerFactory"/>
+  <filter class="solr.CJKWidthFilterFactory"/>
+  <filter class="solr.StopFilterFactory" words="org/apache/lucene/analysis/cn/smart/stopwords.txt"/>
+  <filter class="solr.PorterStemFilterFactory"/>
+  <filter class="solr.LowerCaseFilterFactory"/>
+</analyzer>
+----
+
+[source,xml]
+----
+<analyzer>
+  <tokenizer class="solr.ICUTokenizerFactory"/>
+  <filter class="solr.CJKWidthFilterFactory"/>
+  <filter class="solr.StopFilterFactory" words="org/apache/lucene/analysis/cn/smart/stopwords.txt"/>
+  <filter class="solr.LowerCaseFilterFactory"/>
+</analyzer>
+----
+
+[[LanguageAnalysis-HMMChineseTokenizer]]
+=== HMM Chinese Tokenizer
+
+For Simplified Chinese, Solr provides support for Chinese sentence and word segmentation with the `solr.HMMChineseTokenizerFactory` in the `analysis-extras` contrib module. This component includes a large dictionary and segments Chinese text into words with the Hidden Markov Model. To use this tokenizer, see `solr/contrib/analysis-extras/README.txt` for instructions on which jars you need to add to your `SOLR_HOME/lib`.
 
 *Factory class:* `solr.HMMChineseTokenizerFactory`
 
@@ -528,35 +612,7 @@ To use the default setup with fallback to English Porter stemmer for English wor
 
 `<analyzer class="org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer"/>`
 
-Or to configure your own analysis setup, use the `solr.HMMChineseTokenizerFactory` along with your custom filter setup.
-
-[source,xml]
-----
-<analyzer>
-  <tokenizer class="solr.HMMChineseTokenizerFactory"/>
-  <filter class="solr.StopFilterFactory" words="org/apache/lucene/analysis/cn/smart/stopwords.txt"/>
-  <filter class="solr.PorterStemFilterFactory"/>
-</analyzer>
-----
-
-[[LanguageAnalysis-CJK]]
-=== CJK
-
-This tokenizer breaks Chinese, Japanese and Korean language text into tokens. These are not whitespace delimited languages. The tokens generated by this tokenizer are "doubles", overlapping pairs of CJK characters found in the field text.
-
-*Factory class:* `solr.CJKTokenizerFactory`
-
-*Arguments:* None
-
-*Example:*
-
-[source,xml]
-----
-<analyzer type="index">
-  <tokenizer class="solr.CJKTokenizerFactory"/>
-</analyzer>
-----
+Or to configure your own analysis setup, use the `solr.HMMChineseTokenizerFactory` along with your custom filter setup. See an example of this in the <<LanguageAnalysis-SimplifiedChinese,Simplified Chinese>> section.
 
 [[LanguageAnalysis-Czech]]
 === Czech
@@ -947,15 +1003,15 @@ Solr can stem Irish using the Snowball Porter Stemmer with an argument of `langu
 
 Solr includes support for analyzing Japanese, via the Lucene Kuromoji morphological analyzer, which includes several analysis components - more details on each below:
 
-* `JapaneseIterationMarkCharFilter` normalizes Japanese horizontal iteration marks (odoriji) to their expanded form.
-* `JapaneseTokenizer` tokenizes Japanese using morphological analysis, and annotates each term with part-of-speech, base form (a.k.a. lemma), reading and pronunciation.
-* `JapaneseBaseFormFilter` replaces original terms with their base forms (a.k.a. lemmas).
-* `JapanesePartOfSpeechStopFilter` removes terms that have one of the configured parts-of-speech.
-* `JapaneseKatakanaStemFilter` normalizes common katakana spelling variations ending in a long sound character (U+30FC) by removing the long sound character.
+* <<LanguageAnalysis-JapaneseIterationMarkCharFilter,Japanese Iteration Mark CharFilter>> normalizes Japanese horizontal iteration marks (odoriji) to their expanded form.
+* <<LanguageAnalysis-JapaneseTokenizer,Japanese Tokenizer>> tokenizes Japanese using morphological analysis, and annotates each term with part-of-speech, base form (a.k.a. lemma), reading and pronunciation.
+* <<LanguageAnalysis-JapaneseBaseFormFilter,Japanese Base Form Filter>> replaces original terms with their base forms (a.k.a. lemmas).
+* <<LanguageAnalysis-JapanesePartOfSpeechStopFilter,Japanese Part Of Speech Stop Filter>> removes terms that have one of the configured parts-of-speech.
+* <<LanguageAnalysis-JapaneseKatakanaStemFilter,Japanese Katakana Stem Filter>> normalizes common katakana spelling variations ending in a long sound character (U+30FC) by removing the long sound character.
 
 Also useful for Japanese analysis, from lucene-analyzers-common:
 
-* `CJKWidthFilter` folds fullwidth ASCII variants into the equivalent Basic Latin forms, and folds halfwidth Katakana variants into their equivalent fullwidth forms.
+* <<filter-descriptions.adoc#FilterDescriptions-CJKWidthFilter,CJK Width Filter>> folds fullwidth ASCII variants into the equivalent Basic Latin forms, and folds halfwidth Katakana variants into their equivalent fullwidth forms.
 
 [[LanguageAnalysis-JapaneseIterationMarkCharFilter]]
 ==== Japanese Iteration Mark CharFilter
@@ -1022,7 +1078,7 @@ Removes terms with one of the configured parts-of-speech. `JapaneseTokenizer` an
 
 Normalizes common katakana spelling variations ending in a long sound character (U+30FC) by removing the long sound character.
 
-`CJKWidthFilterFactory` should be specified prior to this filter to normalize half-width katakana to full-width.
+<<filter-descriptions.adoc#FilterDescriptions-CJKWidthFilter,CJK Width Filter>> should be specified prior to this filter to normalize half-width katakana to full-width.
 
 *Factory class:* `JapaneseKatakanaStemFilterFactory`
 
diff --git a/solr/solr-ref-guide/src/tokenizers.adoc b/solr/solr-ref-guide/src/tokenizers.adoc
index 5c7a819c6aa..7a8bdeb37f4 100644
--- a/solr/solr-ref-guide/src/tokenizers.adoc
+++ b/solr/solr-ref-guide/src/tokenizers.adoc
@@ -286,7 +286,7 @@ This tokenizer processes multilingual text and tokenizes it appropriately based
 
 You can customize this tokenizer's behavior by specifying http://userguide.icu-project.org/boundaryanalysis#TOC-RBBI-Rules[per-script rule files]. To add per-script rules, add a `rulefiles` argument, which should contain a comma-separated list of `code:rulefile` pairs in the following format: four-letter ISO 15924 script code, followed by a colon, then a resource path. For example, to specify rules for Latin (script code "Latn") and Cyrillic (script code "Cyrl"), you would enter `Latn:my.Latin.rules.rbbi,Cyrl:my.Cyrillic.rules.rbbi`.
 
-The default `solr.ICUTokenizerFactory` provides UAX#29 word break rules tokenization (like `solr.StandardTokenizer`), but also includes custom tailorings for Hebrew (specializing handling of double and single quotation marks), and for syllable tokenization for Khmer, Lao, and Myanmar.
+The default configuration for `solr.ICUTokenizerFactory` provides UAX#29 word break rules tokenization (like `solr.StandardTokenizer`), but also includes custom tailorings for Hebrew (specializing handling of double and single quotation marks), for syllable tokenization for Khmer, Lao, and Myanmar, and dictionary-based word segmentation for CJK characters.
 
 *Factory class:* `solr.ICUTokenizerFactory`
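
For readers trying out the `rulefiles` customization described in the tokenizers.adoc change above, a minimal field type sketch could look like the following. The field type name `text_icu_custom` and the two `.rbbi` resource paths are illustrative placeholders, not files shipped with Solr:

[source,xml]
----
<fieldType name="text_icu_custom" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <!-- Per-script rules: four-letter ISO 15924 script code, a colon, then a resource path -->
    <tokenizer class="solr.ICUTokenizerFactory"
               rulefiles="Latn:my.Latin.rules.rbbi,Cyrl:my.Cyrillic.rules.rbbi"/>
    <filter class="solr.LowerCaseFilterFactory"/>
  </analyzer>
</fieldType>
----

The rule files are loaded as resources (for example from the config set), and the ICU analysis jars from `solr/contrib/analysis-extras` must be on the classpath for `solr.ICUTokenizerFactory` to load.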
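
As a rough illustration of the CJK Bigram Filter arguments documented in the language-analysis.adoc changes above, an analyzer might combine the filter with the Standard Tokenizer along these lines; the argument values shown are only one possible combination, chosen to show the finer-grained control, not a recommended configuration:

[source,xml]
----
<analyzer>
  <tokenizer class="solr.StandardTokenizerFactory"/>
  <filter class="solr.CJKWidthFilterFactory"/>
  <!-- Bigram Han, Hiragana and Katakana; pass Hangul through as unigrams;
       also emit unigrams alongside the bigrams -->
  <filter class="solr.CJKBigramFilterFactory"
          han="true" hiragana="true" katakana="true" hangul="false"
          outputUnigrams="true"/>
</analyzer>
----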
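
Similarly, the note above that the CJK Width Filter should precede the Japanese Katakana Stem Filter can be pictured with a sketch like the one below; the `mode` and `minimumLength` values are illustrative defaults rather than required settings:

[source,xml]
----
<analyzer>
  <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
  <filter class="solr.JapaneseBaseFormFilterFactory"/>
  <!-- Normalize half-width katakana to full-width before katakana stemming -->
  <filter class="solr.CJKWidthFilterFactory"/>
  <filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
  <filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
----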