mirror of https://github.com/apache/lucene.git
SOLR-3056: add example japanese field type, lazy-load kuromoji resources
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1242573 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
9f783ead67
commit
8c2e20bb2d
|
@ -485,6 +485,7 @@ New Features
|
||||||
Uwe Schindler)
|
Uwe Schindler)
|
||||||
|
|
||||||
* LUCENE-3305, SOLR-3056: Added Kuromoji morphological analyzer for Japanese.
|
* LUCENE-3305, SOLR-3056: Added Kuromoji morphological analyzer for Japanese.
|
||||||
|
See the 'text_ja' fieldtype in the example to get started.
|
||||||
(Christian Moen, Masaru Hasegawa via Robert Muir)
|
(Christian Moen, Masaru Hasegawa via Robert Muir)
|
||||||
|
|
||||||
* SOLR-1860: StopFilterFactory, CommonGramsFilterFactory, and
|
* SOLR-1860: StopFilterFactory, CommonGramsFilterFactory, and
|
||||||
|
|
|
@ -59,11 +59,12 @@ public class KuromojiTokenizerFactory extends BaseTokenizerFactory implements Re
|
||||||
|
|
||||||
private static final String USER_DICT_ENCODING = "user-dictionary-encoding";
|
private static final String USER_DICT_ENCODING = "user-dictionary-encoding";
|
||||||
|
|
||||||
private Segmenter segmenter;
|
private UserDictionary userDictionary;
|
||||||
|
private Mode mode;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void inform(ResourceLoader loader) {
|
public void inform(ResourceLoader loader) {
|
||||||
Mode mode = getMode(args);
|
mode = getMode(args);
|
||||||
String userDictionaryPath = args.get(USER_DICT_PATH);
|
String userDictionaryPath = args.get(USER_DICT_PATH);
|
||||||
try {
|
try {
|
||||||
if (userDictionaryPath != null) {
|
if (userDictionaryPath != null) {
|
||||||
|
@ -76,9 +77,9 @@ public class KuromojiTokenizerFactory extends BaseTokenizerFactory implements Re
|
||||||
.onMalformedInput(CodingErrorAction.REPORT)
|
.onMalformedInput(CodingErrorAction.REPORT)
|
||||||
.onUnmappableCharacter(CodingErrorAction.REPORT);
|
.onUnmappableCharacter(CodingErrorAction.REPORT);
|
||||||
Reader reader = new InputStreamReader(stream, decoder);
|
Reader reader = new InputStreamReader(stream, decoder);
|
||||||
this.segmenter = new Segmenter(new UserDictionary(reader), mode);
|
userDictionary = new UserDictionary(reader);
|
||||||
} else {
|
} else {
|
||||||
this.segmenter = new Segmenter(mode);
|
userDictionary = null;
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
|
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
|
||||||
|
@ -87,7 +88,7 @@ public class KuromojiTokenizerFactory extends BaseTokenizerFactory implements Re
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Tokenizer create(Reader input) {
|
public Tokenizer create(Reader input) {
|
||||||
return new KuromojiTokenizer(segmenter, input);
|
return new KuromojiTokenizer(new Segmenter(userDictionary, mode), input);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Mode getMode(Map<String, String> args) {
|
private Mode getMode(Map<String, String> args) {
|
||||||
|
|
|
@ -491,7 +491,7 @@
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
|
||||||
<!-- CJK bigram (see text_ja for an alternative Japanese configuration) -->
|
<!-- CJK bigram (see text_ja for a Japanese configuration using morphological analysis) -->
|
||||||
<fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100">
|
<fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100">
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
|
@ -676,6 +676,44 @@
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
|
||||||
|
<!-- Japanese using morphological analysis (see text_cjk for a configuration using bigramming)
|
||||||
|
|
||||||
|
NOTE: If you want to optimize search for precision, use default operator AND in your query
|
||||||
|
parser config with <solrQueryParser defaultOperator="AND"/> further down in this file. Use
|
||||||
|
OR if you would like to optimize for recall (default).
|
||||||
|
-->
|
||||||
|
<fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
|
||||||
|
<analyzer>
|
||||||
|
<!-- Kuromoji Japanese morphological analyzer/tokenizer.
|
||||||
|
|
||||||
|
Uses a search-mode (heuristic) to get a noun-decompounding effect that is useful for search.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
関西国際空港 (Kansai International Airport) becomes 関西 (Kansai) 国際 (International) 空港 (airport)
|
||||||
|
so we get a match when searching for 空港 (airport) as we would expect from a good search engine.
|
||||||
|
(With regular segmentation 関西国際空港 becomes one word and we don't get a hit.)
|
||||||
|
|
||||||
|
Valid values for mode are:
|
||||||
|
normal: regular segmentation
|
||||||
|
search: segmentation useful for search with extra splitting (default)
|
||||||
|
extended: same as search mode, but unigrams unknown words (experimental)
|
||||||
|
|
||||||
|
NOTE: Search-mode improves segmentation for search at the expense of part-of-speech and reading accuracy
|
||||||
|
-->
|
||||||
|
<tokenizer class="solr.KuromojiTokenizerFactory" mode="search"/>
|
||||||
|
<!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
|
||||||
|
<filter class="solr.KuromojiBaseFormFilterFactory"/>
|
||||||
|
<!-- Removes tokens with certain part-of-speech tags -->
|
||||||
|
<filter class="solr.KuromojiPartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" enablePositionIncrements="true"/>
|
||||||
|
<!-- Normalizes full-width romaji to half-width and half-width kana to full-width (Unicode NFKC subset) -->
|
||||||
|
<filter class="solr.CJKWidthFilterFactory"/>
|
||||||
|
<!-- Removes common tokens that are typically not useful for search but have a negative effect on ranking -->
|
||||||
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" />
|
||||||
|
<!-- Lower-case romaji characters -->
|
||||||
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
</analyzer>
|
||||||
|
</fieldType>
|
||||||
|
|
||||||
<!-- Latvian -->
|
<!-- Latvian -->
|
||||||
<fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
|
<fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
|
||||||
<analyzer>
|
<analyzer>
|
||||||
|
|
Loading…
Reference in New Issue