SOLR-12655: Add Korean morphological analyzer ("nori") to default distribution. This also adds examples for configuration in Solr's schema

Uwe Schindler 2018-08-11 14:07:31 +02:00
parent f2c0005e9d
commit 928b92caa0
9 changed files with 220 additions and 2 deletions

View File

@@ -155,6 +155,9 @@ New Features
* SOLR-12485: Uploading docs in XML now supports child documents as field values, thus providing a label to the
  relationship instead of the current "anonymous" relationship. (Moshe Bla, David Smiley)

* SOLR-12655: Add Korean morphological analyzer ("nori") to default distribution. This also adds examples
  for configuration in Solr's schema. (Uwe Schindler)

Bug Fixes
----------------------

View File

@@ -94,6 +94,7 @@
-->
<pathelement location="${analyzers-common.jar}"/>
<pathelement location="${analyzers-kuromoji.jar}"/>
<pathelement location="${analyzers-nori.jar}"/>
<pathelement location="${analyzers-phonetic.jar}"/>
<pathelement location="${codecs.jar}"/>
<pathelement location="${backward-codecs.jar}"/>
@@ -171,7 +172,7 @@
<target name="prep-lucene-jars"
depends="resolve-groovy,
jar-lucene-core, jar-backward-codecs, jar-analyzers-phonetic, jar-analyzers-kuromoji, jar-codecs,jar-expressions, jar-suggest, jar-highlighter, jar-memory,
jar-lucene-core, jar-backward-codecs, jar-analyzers-phonetic, jar-analyzers-kuromoji, jar-analyzers-nori, jar-codecs,jar-expressions, jar-suggest, jar-highlighter, jar-memory,
jar-misc, jar-spatial-extras, jar-spatial3d, jar-grouping, jar-queries, jar-queryparser, jar-join, jar-sandbox, jar-classification">
<property name="solr.deps.compiled" value="true"/>
</target>
@@ -248,7 +249,7 @@
<property name="lucenedocs" location="${common.dir}/build/docs"/>
<!-- dependency to ensure all lucene javadocs are present -->
<target name="lucene-javadocs" depends="javadocs-lucene-core,javadocs-analyzers-common,javadocs-analyzers-icu,javadocs-analyzers-kuromoji,javadocs-analyzers-phonetic,javadocs-analyzers-smartcn,javadocs-analyzers-morfologik,javadocs-analyzers-stempel,javadocs-backward-codecs,javadocs-codecs,javadocs-expressions,javadocs-suggest,javadocs-grouping,javadocs-queries,javadocs-queryparser,javadocs-highlighter,javadocs-memory,javadocs-misc,javadocs-spatial-extras,javadocs-join,javadocs-test-framework"/>
<target name="lucene-javadocs" depends="javadocs-lucene-core,javadocs-analyzers-common,javadocs-analyzers-icu,javadocs-analyzers-kuromoji,javadocs-analyzers-nori,javadocs-analyzers-phonetic,javadocs-analyzers-smartcn,javadocs-analyzers-morfologik,javadocs-analyzers-stempel,javadocs-backward-codecs,javadocs-codecs,javadocs-expressions,javadocs-suggest,javadocs-grouping,javadocs-queries,javadocs-queryparser,javadocs-highlighter,javadocs-memory,javadocs-misc,javadocs-spatial-extras,javadocs-join,javadocs-test-framework"/>
<!-- create javadocs for the current module -->
<target name="javadocs" depends="compile-core,define-lucene-javadoc-url,lucene-javadocs,javadocs-solr-core,check-javadocs-uptodate" unless="javadocs-uptodate-${name}">
@@ -309,6 +310,7 @@
<link offline="true" href="${lucene.javadoc.url}analyzers-common" packagelistloc="${lucenedocs}/analyzers-common"/>
<link offline="true" href="${lucene.javadoc.url}analyzers-icu" packagelistloc="${lucenedocs}/analyzers-icu"/>
<link offline="true" href="${lucene.javadoc.url}analyzers-kuromoji" packagelistloc="${lucenedocs}/analyzers-kuromoji"/>
<link offline="true" href="${lucene.javadoc.url}analyzers-nori" packagelistloc="${lucenedocs}/analyzers-nori"/>
<link offline="true" href="${lucene.javadoc.url}analyzers-morfologik" packagelistloc="${lucenedocs}/analyzers-morfologik"/>
<link offline="true" href="${lucene.javadoc.url}analyzers-phonetic" packagelistloc="${lucenedocs}/analyzers-phonetic"/>
<link offline="true" href="${lucene.javadoc.url}analyzers-smartcn" packagelistloc="${lucenedocs}/analyzers-smartcn"/>

View File

@@ -849,6 +849,40 @@
</analyzer>
</fieldType>
<!-- Korean morphological analysis -->
<dynamicField name="*_txt_ko" type="text_ko" indexed="true" stored="true"/>
<fieldType name="text_ko" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<!-- Nori Korean morphological analyzer/tokenizer (KoreanTokenizer)
The Korean (nori) analyzer integrates the Lucene nori analysis module into Solr.
It uses the mecab-ko-dic dictionary to perform morphological analysis of Korean text.
This dictionary was built with MeCab; it defines a format for the features adapted
to the Korean language.
Nori also has a convenient user dictionary feature that allows overriding the statistical
model with your own entries for segmentation, part-of-speech tags, and readings, without
needing to specify weights. Note that user dictionaries have not been subject to extensive testing.
The tokenizer supports multiple schema attributes:
* userDictionary: path to the user dictionary.
* userDictionaryEncoding: character encoding of the user dictionary.
* decompoundMode: how to handle compound nouns; either 'none', 'discard', or 'mixed'. Default is 'discard'.
* outputUnknownUnigrams: if true, outputs unigrams for unknown words.
-->
<tokenizer class="solr.KoreanTokenizerFactory" decompoundMode="discard" outputUnknownUnigrams="false"/>
<!-- Removes tokens matching certain part-of-speech tags, such as EOMI (Pos.E). You can add a
'tags' parameter listing the tags to remove. By default it removes:
E, IC, J, MAG, MAJ, MM, SP, SSC, SSO, SC, SE, XPN, XSA, XSN, XSV, UNA, NA, VSV
This is roughly equivalent to stemming.
-->
<filter class="solr.KoreanPartOfSpeechStopFilterFactory" />
<!-- Replaces term text with the Hangul transcription of Hanja characters, if applicable: -->
<filter class="solr.KoreanReadingFormFilterFactory" />
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldType>
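A user dictionary can be wired in through the userDictionary attribute described above. Below is a minimal sketch, assuming a file named userdict_ko.txt in the collection's conf directory and the nori user dictionary format (one surface form per line, optionally followed by its segmentation); the file name and entries are only illustrative:

userdict_ko.txt:
c++
세종시 세종 시

<tokenizer class="solr.KoreanTokenizerFactory" userDictionary="userdict_ko.txt" userDictionaryEncoding="UTF-8" decompoundMode="discard" outputUnknownUnigrams="false"/>

The first entry adds a single custom noun; the second declares that the compound 세종시 should be segmented into 세종 and 시.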
<!-- Latvian -->
<dynamicField name="*_txt_lv" type="text_lv" indexed="true" stored="true"/>
<fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">

View File

@@ -996,6 +996,40 @@
</analyzer>
</fieldType>
<!-- Korean morphological analysis -->
<dynamicField name="*_txt_ko" type="text_ko" indexed="true" stored="true"/>
<fieldType name="text_ko" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<!-- Nori Korean morphological analyzer/tokenizer (KoreanTokenizer)
The Korean (nori) analyzer integrates the Lucene nori analysis module into Solr.
It uses the mecab-ko-dic dictionary to perform morphological analysis of Korean text.
This dictionary was built with MeCab; it defines a format for the features adapted
to the Korean language.
Nori also has a convenient user dictionary feature that allows overriding the statistical
model with your own entries for segmentation, part-of-speech tags, and readings, without
needing to specify weights. Note that user dictionaries have not been subject to extensive testing.
The tokenizer supports multiple schema attributes:
* userDictionary: path to the user dictionary.
* userDictionaryEncoding: character encoding of the user dictionary.
* decompoundMode: how to handle compound nouns; either 'none', 'discard', or 'mixed'. Default is 'discard'.
* outputUnknownUnigrams: if true, outputs unigrams for unknown words.
-->
<tokenizer class="solr.KoreanTokenizerFactory" decompoundMode="discard" outputUnknownUnigrams="false"/>
<!-- Removes tokens matching certain part-of-speech tags, such as EOMI (Pos.E). You can add a
'tags' parameter listing the tags to remove. By default it removes:
E, IC, J, MAG, MAJ, MM, SP, SSC, SSO, SC, SE, XPN, XSA, XSN, XSV, UNA, NA, VSV
This is roughly equivalent to stemming.
-->
<filter class="solr.KoreanPartOfSpeechStopFilterFactory" />
<!-- Replaces term text with the Hangul transcription of Hanja characters, if applicable: -->
<filter class="solr.KoreanReadingFormFilterFactory" />
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldType>
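To illustrate the decompoundMode values described in the comment above: for a compound noun such as 삼성전자 ("Samsung Electronics"), 'none' keeps only the compound token (삼성전자), 'discard' (the default) emits only the parts (삼성, 전자), and 'mixed' emits both the compound and its parts (삼성전자, 삼성, 전자). The example word and its segmentation are illustrative; actual output depends on the mecab-ko-dic dictionary.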
<!-- Latvian -->
<fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
<analyzer>

View File

@@ -915,6 +915,40 @@
</analyzer>
</fieldType>
<!-- Korean morphological analysis -->
<dynamicField name="*_txt_ko" type="text_ko" indexed="true" stored="true"/>
<fieldType name="text_ko" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<!-- Nori Korean morphological analyzer/tokenizer (KoreanTokenizer)
The Korean (nori) analyzer integrates the Lucene nori analysis module into Solr.
It uses the mecab-ko-dic dictionary to perform morphological analysis of Korean text.
This dictionary was built with MeCab; it defines a format for the features adapted
to the Korean language.
Nori also has a convenient user dictionary feature that allows overriding the statistical
model with your own entries for segmentation, part-of-speech tags, and readings, without
needing to specify weights. Note that user dictionaries have not been subject to extensive testing.
The tokenizer supports multiple schema attributes:
* userDictionary: path to the user dictionary.
* userDictionaryEncoding: character encoding of the user dictionary.
* decompoundMode: how to handle compound nouns; either 'none', 'discard', or 'mixed'. Default is 'discard'.
* outputUnknownUnigrams: if true, outputs unigrams for unknown words.
-->
<tokenizer class="solr.KoreanTokenizerFactory" decompoundMode="discard" outputUnknownUnigrams="false"/>
<!-- Removes tokens matching certain part-of-speech tags, such as EOMI (Pos.E). You can add a
'tags' parameter listing the tags to remove. By default it removes:
E, IC, J, MAG, MAJ, MM, SP, SSC, SSO, SC, SE, XPN, XSA, XSN, XSV, UNA, NA, VSV
This is roughly equivalent to stemming.
-->
<filter class="solr.KoreanPartOfSpeechStopFilterFactory" />
<!-- Replaces term text with the Hangul transcription of Hanja characters, if applicable: -->
<filter class="solr.KoreanReadingFormFilterFactory" />
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldType>
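For example, with KoreanReadingFormFilterFactory in the chain, a token written in Hanja such as 中國 is replaced by its Hangul reading 중국, so that Hanja and Hangul spellings of the same word match at query time.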
<!-- Latvian -->
<fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
<analyzer>

View File

@@ -996,6 +996,40 @@
</analyzer>
</fieldType>
<!-- Korean morphological analysis -->
<dynamicField name="*_txt_ko" type="text_ko" indexed="true" stored="true"/>
<fieldType name="text_ko" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<!-- Nori Korean morphological analyzer/tokenizer (KoreanTokenizer)
The Korean (nori) analyzer integrates the Lucene nori analysis module into Solr.
It uses the mecab-ko-dic dictionary to perform morphological analysis of Korean text.
This dictionary was built with MeCab; it defines a format for the features adapted
to the Korean language.
Nori also has a convenient user dictionary feature that allows overriding the statistical
model with your own entries for segmentation, part-of-speech tags, and readings, without
needing to specify weights. Note that user dictionaries have not been subject to extensive testing.
The tokenizer supports multiple schema attributes:
* userDictionary: path to the user dictionary.
* userDictionaryEncoding: character encoding of the user dictionary.
* decompoundMode: how to handle compound nouns; either 'none', 'discard', or 'mixed'. Default is 'discard'.
* outputUnknownUnigrams: if true, outputs unigrams for unknown words.
-->
<tokenizer class="solr.KoreanTokenizerFactory" decompoundMode="discard" outputUnknownUnigrams="false"/>
<!-- Removes tokens matching certain part-of-speech tags, such as EOMI (Pos.E). You can add a
'tags' parameter listing the tags to remove. By default it removes:
E, IC, J, MAG, MAJ, MM, SP, SSC, SSO, SC, SE, XPN, XSA, XSN, XSV, UNA, NA, VSV
This is roughly equivalent to stemming.
-->
<filter class="solr.KoreanPartOfSpeechStopFilterFactory" />
<!-- Replaces term text with the Hangul transcription of Hanja characters, if applicable: -->
<filter class="solr.KoreanReadingFormFilterFactory" />
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldType>
<!-- Latvian -->
<fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
<analyzer>

View File

@@ -322,6 +322,14 @@
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_ko" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.KoreanTokenizerFactory" decompoundMode="discard" outputUnknownUnigrams="false"/>
<filter class="solr.KoreanPartOfSpeechStopFilterFactory" />
<filter class="solr.KoreanReadingFormFilterFactory" />
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldType>
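The chain can be checked without indexing anything by querying Solr's implicit field analysis handler. A sketch, assuming a collection named mycoll (the collection name and sample text are placeholders):

http://localhost:8983/solr/mycoll/analysis/field?analysis.fieldtype=text_ko&analysis.fieldvalue=중국어

The response lists the tokens emitted at each stage of the text_ko analyzer.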
<fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
@@ -470,6 +478,7 @@
<dynamicField name="*_txt_id" type="text_id" indexed="true" stored="true"/>
<dynamicField name="*_txt_it" type="text_it" indexed="true" stored="true"/>
<dynamicField name="*_txt_ja" type="text_ja" indexed="true" stored="true"/>
<dynamicField name="*_txt_ko" type="text_ko" indexed="true" stored="true"/>
<dynamicField name="*_txt_lv" type="text_lv" indexed="true" stored="true"/>
<dynamicField name="*_txt_nl" type="text_nl" indexed="true" stored="true"/>
<dynamicField name="*_txt_no" type="text_no" indexed="true" stored="true"/>

View File

@@ -849,6 +849,40 @@
</analyzer>
</fieldType>
<!-- Korean morphological analysis -->
<dynamicField name="*_txt_ko" type="text_ko" indexed="true" stored="true"/>
<fieldType name="text_ko" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<!-- Nori Korean morphological analyzer/tokenizer (KoreanTokenizer)
The Korean (nori) analyzer integrates the Lucene nori analysis module into Solr.
It uses the mecab-ko-dic dictionary to perform morphological analysis of Korean text.
This dictionary was built with MeCab; it defines a format for the features adapted
to the Korean language.
Nori also has a convenient user dictionary feature that allows overriding the statistical
model with your own entries for segmentation, part-of-speech tags, and readings, without
needing to specify weights. Note that user dictionaries have not been subject to extensive testing.
The tokenizer supports multiple schema attributes:
* userDictionary: path to the user dictionary.
* userDictionaryEncoding: character encoding of the user dictionary.
* decompoundMode: how to handle compound nouns; either 'none', 'discard', or 'mixed'. Default is 'discard'.
* outputUnknownUnigrams: if true, outputs unigrams for unknown words.
-->
<tokenizer class="solr.KoreanTokenizerFactory" decompoundMode="discard" outputUnknownUnigrams="false"/>
<!-- Removes tokens matching certain part-of-speech tags, such as EOMI (Pos.E). You can add a
'tags' parameter listing the tags to remove. By default it removes:
E, IC, J, MAG, MAJ, MM, SP, SSC, SSO, SC, SE, XPN, XSA, XSN, XSV, UNA, NA, VSV
This is roughly equivalent to stemming.
-->
<filter class="solr.KoreanPartOfSpeechStopFilterFactory" />
<!-- Replaces term text with the Hangul transcription of Hanja characters, if applicable: -->
<filter class="solr.KoreanReadingFormFilterFactory" />
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldType>
<!-- Latvian -->
<dynamicField name="*_txt_lv" type="text_lv" indexed="true" stored="true"/>
<fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">

View File

@@ -1032,6 +1032,40 @@
</analyzer>
</fieldType>
<!-- Korean morphological analysis -->
<dynamicField name="*_txt_ko" type="text_ko" indexed="true" stored="true"/>
<fieldType name="text_ko" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<!-- Nori Korean morphological analyzer/tokenizer (KoreanTokenizer)
The Korean (nori) analyzer integrates the Lucene nori analysis module into Solr.
It uses the mecab-ko-dic dictionary to perform morphological analysis of Korean text.
This dictionary was built with MeCab; it defines a format for the features adapted
to the Korean language.
Nori also has a convenient user dictionary feature that allows overriding the statistical
model with your own entries for segmentation, part-of-speech tags, and readings, without
needing to specify weights. Note that user dictionaries have not been subject to extensive testing.
The tokenizer supports multiple schema attributes:
* userDictionary: path to the user dictionary.
* userDictionaryEncoding: character encoding of the user dictionary.
* decompoundMode: how to handle compound nouns; either 'none', 'discard', or 'mixed'. Default is 'discard'.
* outputUnknownUnigrams: if true, outputs unigrams for unknown words.
-->
<tokenizer class="solr.KoreanTokenizerFactory" decompoundMode="discard" outputUnknownUnigrams="false"/>
<!-- Removes tokens matching certain part-of-speech tags, such as EOMI (Pos.E). You can add a
'tags' parameter listing the tags to remove. By default it removes:
E, IC, J, MAG, MAJ, MM, SP, SSC, SSO, SC, SE, XPN, XSA, XSN, XSV, UNA, NA, VSV
This is roughly equivalent to stemming.
-->
<filter class="solr.KoreanPartOfSpeechStopFilterFactory" />
<!-- Replaces term text with the Hangul transcription of Hanja characters, if applicable: -->
<filter class="solr.KoreanReadingFormFilterFactory" />
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldType>
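As the comment above notes, the default stop-tag set can be overridden with a 'tags' parameter. A hedged sketch that removes only verbal endings (E) and particles (J); the tag selection is illustrative:

<filter class="solr.KoreanPartOfSpeechStopFilterFactory" tags="E,J"/>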
<!-- Latvian -->
<fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
<analyzer>