mirror of https://github.com/apache/lucene.git
SOLR-12655: Add Korean morphological analyzer ("nori") to default distribution. This also adds examples for configuration in Solr's schema
This commit is contained in:
parent
f2c0005e9d
commit
928b92caa0
|
@ -155,6 +155,9 @@ New Features
|
|||
* SOLR-12485: Uploading docs in XML now supports child documents as field values, thus providing a label to the
|
||||
relationship instead of the current "anonymous" relationship. (Moshe Bla, David Smiley)
|
||||
|
||||
* SOLR-12655: Add Korean morphological analyzer ("nori") to default distribution. This also adds examples
|
||||
for configuration in Solr's schema. (Uwe Schindler)
|
||||
|
||||
Bug Fixes
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -94,6 +94,7 @@
|
|||
-->
|
||||
<pathelement location="${analyzers-common.jar}"/>
|
||||
<pathelement location="${analyzers-kuromoji.jar}"/>
|
||||
<pathelement location="${analyzers-nori.jar}"/>
|
||||
<pathelement location="${analyzers-phonetic.jar}"/>
|
||||
<pathelement location="${codecs.jar}"/>
|
||||
<pathelement location="${backward-codecs.jar}"/>
|
||||
|
@ -171,7 +172,7 @@
|
|||
|
||||
<target name="prep-lucene-jars"
|
||||
depends="resolve-groovy,
|
||||
jar-lucene-core, jar-backward-codecs, jar-analyzers-phonetic, jar-analyzers-kuromoji, jar-codecs,jar-expressions, jar-suggest, jar-highlighter, jar-memory,
|
||||
jar-lucene-core, jar-backward-codecs, jar-analyzers-phonetic, jar-analyzers-kuromoji, jar-analyzers-nori, jar-codecs,jar-expressions, jar-suggest, jar-highlighter, jar-memory,
|
||||
jar-misc, jar-spatial-extras, jar-spatial3d, jar-grouping, jar-queries, jar-queryparser, jar-join, jar-sandbox, jar-classification">
|
||||
<property name="solr.deps.compiled" value="true"/>
|
||||
</target>
|
||||
|
@ -248,7 +249,7 @@
|
|||
<property name="lucenedocs" location="${common.dir}/build/docs"/>
|
||||
|
||||
<!-- dependency to ensure all lucene javadocs are present -->
|
||||
<target name="lucene-javadocs" depends="javadocs-lucene-core,javadocs-analyzers-common,javadocs-analyzers-icu,javadocs-analyzers-kuromoji,javadocs-analyzers-phonetic,javadocs-analyzers-smartcn,javadocs-analyzers-morfologik,javadocs-analyzers-stempel,javadocs-backward-codecs,javadocs-codecs,javadocs-expressions,javadocs-suggest,javadocs-grouping,javadocs-queries,javadocs-queryparser,javadocs-highlighter,javadocs-memory,javadocs-misc,javadocs-spatial-extras,javadocs-join,javadocs-test-framework"/>
|
||||
<target name="lucene-javadocs" depends="javadocs-lucene-core,javadocs-analyzers-common,javadocs-analyzers-icu,javadocs-analyzers-kuromoji,javadocs-analyzers-nori,javadocs-analyzers-phonetic,javadocs-analyzers-smartcn,javadocs-analyzers-morfologik,javadocs-analyzers-stempel,javadocs-backward-codecs,javadocs-codecs,javadocs-expressions,javadocs-suggest,javadocs-grouping,javadocs-queries,javadocs-queryparser,javadocs-highlighter,javadocs-memory,javadocs-misc,javadocs-spatial-extras,javadocs-join,javadocs-test-framework"/>
|
||||
|
||||
<!-- create javadocs for the current module -->
|
||||
<target name="javadocs" depends="compile-core,define-lucene-javadoc-url,lucene-javadocs,javadocs-solr-core,check-javadocs-uptodate" unless="javadocs-uptodate-${name}">
|
||||
|
@ -309,6 +310,7 @@
|
|||
<link offline="true" href="${lucene.javadoc.url}analyzers-common" packagelistloc="${lucenedocs}/analyzers-common"/>
|
||||
<link offline="true" href="${lucene.javadoc.url}analyzers-icu" packagelistloc="${lucenedocs}/analyzers-icu"/>
|
||||
<link offline="true" href="${lucene.javadoc.url}analyzers-kuromoji" packagelistloc="${lucenedocs}/analyzers-kuromoji"/>
|
||||
<link offline="true" href="${lucene.javadoc.url}analyzers-nori" packagelistloc="${lucenedocs}/analyzers-nori"/>
|
||||
<link offline="true" href="${lucene.javadoc.url}analyzers-morfologik" packagelistloc="${lucenedocs}/analyzers-morfologik"/>
|
||||
<link offline="true" href="${lucene.javadoc.url}analyzers-phonetic" packagelistloc="${lucenedocs}/analyzers-phonetic"/>
|
||||
<link offline="true" href="${lucene.javadoc.url}analyzers-smartcn" packagelistloc="${lucenedocs}/analyzers-smartcn"/>
|
||||
|
|
|
@ -849,6 +849,40 @@
|
|||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- Korean morphological analysis -->
|
||||
<dynamicField name="*_txt_ko" type="text_ko" indexed="true" stored="true"/>
|
||||
<fieldType name="text_ko" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<!-- Nori Korean morphological analyzer/tokenizer (KoreanTokenizer)
|
||||
The Korean (nori) analyzer integrates the Lucene nori analysis module into Solr.
|
||||
It uses the mecab-ko-dic dictionary to perform morphological analysis of Korean texts.
|
||||
|
||||
This dictionary was built with MeCab; it defines a format for the features adapted
|
||||
for the Korean language.
|
||||
|
||||
Nori also has a convenient user dictionary feature that allows overriding the statistical
|
||||
model with your own entries for segmentation, part-of-speech tags and readings without a need
|
||||
to specify weights. Notice that user dictionaries have not been subject to extensive testing.
|
||||
|
||||
The tokenizer supports multiple schema attributes:
|
||||
* userDictionary: User dictionary path.
|
||||
* userDictionaryEncoding: User dictionary encoding.
|
||||
* decompoundMode: Decompound mode. Either 'none', 'discard', 'mixed'. Default is 'discard'.
|
||||
* outputUnknownUnigrams: If true, outputs unigrams for unknown words.
|
||||
-->
|
||||
<tokenizer class="solr.KoreanTokenizerFactory" decompoundMode="discard" outputUnknownUnigrams="false"/>
|
||||
<!-- Removes tokens with certain part-of-speech tags, such as EOMI (Pos.E). You can add a parameter 'tags',
|
||||
listing the tags to remove. By default it removes:
|
||||
E, IC, J, MAG, MAJ, MM, SP, SSC, SSO, SC, SE, XPN, XSA, XSN, XSV, UNA, NA, VSV
|
||||
This is basically equivalent to stemming.
|
||||
-->
|
||||
<filter class="solr.KoreanPartOfSpeechStopFilterFactory" />
|
||||
<!-- Replaces term text with the Hangul transcription of Hanja characters, if applicable: -->
|
||||
<filter class="solr.KoreanReadingFormFilterFactory" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- Latvian -->
|
||||
<dynamicField name="*_txt_lv" type="text_lv" indexed="true" stored="true"/>
|
||||
<fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
|
||||
|
|
|
@ -996,6 +996,40 @@
|
|||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- Korean morphological analysis -->
|
||||
<dynamicField name="*_txt_ko" type="text_ko" indexed="true" stored="true"/>
|
||||
<fieldType name="text_ko" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<!-- Nori Korean morphological analyzer/tokenizer (KoreanTokenizer)
|
||||
The Korean (nori) analyzer integrates the Lucene nori analysis module into Solr.
|
||||
It uses the mecab-ko-dic dictionary to perform morphological analysis of Korean texts.
|
||||
|
||||
This dictionary was built with MeCab; it defines a format for the features adapted
|
||||
for the Korean language.
|
||||
|
||||
Nori also has a convenient user dictionary feature that allows overriding the statistical
|
||||
model with your own entries for segmentation, part-of-speech tags and readings without a need
|
||||
to specify weights. Notice that user dictionaries have not been subject to extensive testing.
|
||||
|
||||
The tokenizer supports multiple schema attributes:
|
||||
* userDictionary: User dictionary path.
|
||||
* userDictionaryEncoding: User dictionary encoding.
|
||||
* decompoundMode: Decompound mode. Either 'none', 'discard', 'mixed'. Default is 'discard'.
|
||||
* outputUnknownUnigrams: If true, outputs unigrams for unknown words.
|
||||
-->
|
||||
<tokenizer class="solr.KoreanTokenizerFactory" decompoundMode="discard" outputUnknownUnigrams="false"/>
|
||||
<!-- Removes tokens with certain part-of-speech tags, such as EOMI (Pos.E). You can add a parameter 'tags',
|
||||
listing the tags to remove. By default it removes:
|
||||
E, IC, J, MAG, MAJ, MM, SP, SSC, SSO, SC, SE, XPN, XSA, XSN, XSV, UNA, NA, VSV
|
||||
This is basically equivalent to stemming.
|
||||
-->
|
||||
<filter class="solr.KoreanPartOfSpeechStopFilterFactory" />
|
||||
<!-- Replaces term text with the Hangul transcription of Hanja characters, if applicable: -->
|
||||
<filter class="solr.KoreanReadingFormFilterFactory" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- Latvian -->
|
||||
<fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer>
|
||||
|
|
|
@ -915,6 +915,40 @@
|
|||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- Korean morphological analysis -->
|
||||
<dynamicField name="*_txt_ko" type="text_ko" indexed="true" stored="true"/>
|
||||
<fieldType name="text_ko" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<!-- Nori Korean morphological analyzer/tokenizer (KoreanTokenizer)
|
||||
The Korean (nori) analyzer integrates the Lucene nori analysis module into Solr.
|
||||
It uses the mecab-ko-dic dictionary to perform morphological analysis of Korean texts.
|
||||
|
||||
This dictionary was built with MeCab; it defines a format for the features adapted
|
||||
for the Korean language.
|
||||
|
||||
Nori also has a convenient user dictionary feature that allows overriding the statistical
|
||||
model with your own entries for segmentation, part-of-speech tags and readings without a need
|
||||
to specify weights. Notice that user dictionaries have not been subject to extensive testing.
|
||||
|
||||
The tokenizer supports multiple schema attributes:
|
||||
* userDictionary: User dictionary path.
|
||||
* userDictionaryEncoding: User dictionary encoding.
|
||||
* decompoundMode: Decompound mode. Either 'none', 'discard', 'mixed'. Default is 'discard'.
|
||||
* outputUnknownUnigrams: If true, outputs unigrams for unknown words.
|
||||
-->
|
||||
<tokenizer class="solr.KoreanTokenizerFactory" decompoundMode="discard" outputUnknownUnigrams="false"/>
|
||||
<!-- Removes tokens with certain part-of-speech tags, such as EOMI (Pos.E). You can add a parameter 'tags',
|
||||
listing the tags to remove. By default it removes:
|
||||
E, IC, J, MAG, MAJ, MM, SP, SSC, SSO, SC, SE, XPN, XSA, XSN, XSV, UNA, NA, VSV
|
||||
This is basically equivalent to stemming.
|
||||
-->
|
||||
<filter class="solr.KoreanPartOfSpeechStopFilterFactory" />
|
||||
<!-- Replaces term text with the Hangul transcription of Hanja characters, if applicable: -->
|
||||
<filter class="solr.KoreanReadingFormFilterFactory" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- Latvian -->
|
||||
<fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer>
|
||||
|
|
|
@ -996,6 +996,40 @@
|
|||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- Korean morphological analysis -->
|
||||
<dynamicField name="*_txt_ko" type="text_ko" indexed="true" stored="true"/>
|
||||
<fieldType name="text_ko" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<!-- Nori Korean morphological analyzer/tokenizer (KoreanTokenizer)
|
||||
The Korean (nori) analyzer integrates the Lucene nori analysis module into Solr.
|
||||
It uses the mecab-ko-dic dictionary to perform morphological analysis of Korean texts.
|
||||
|
||||
This dictionary was built with MeCab; it defines a format for the features adapted
|
||||
for the Korean language.
|
||||
|
||||
Nori also has a convenient user dictionary feature that allows overriding the statistical
|
||||
model with your own entries for segmentation, part-of-speech tags and readings without a need
|
||||
to specify weights. Notice that user dictionaries have not been subject to extensive testing.
|
||||
|
||||
The tokenizer supports multiple schema attributes:
|
||||
* userDictionary: User dictionary path.
|
||||
* userDictionaryEncoding: User dictionary encoding.
|
||||
* decompoundMode: Decompound mode. Either 'none', 'discard', 'mixed'. Default is 'discard'.
|
||||
* outputUnknownUnigrams: If true, outputs unigrams for unknown words.
|
||||
-->
|
||||
<tokenizer class="solr.KoreanTokenizerFactory" decompoundMode="discard" outputUnknownUnigrams="false"/>
|
||||
<!-- Removes tokens with certain part-of-speech tags, such as EOMI (Pos.E). You can add a parameter 'tags',
|
||||
listing the tags to remove. By default it removes:
|
||||
E, IC, J, MAG, MAJ, MM, SP, SSC, SSO, SC, SE, XPN, XSA, XSN, XSV, UNA, NA, VSV
|
||||
This is basically equivalent to stemming.
|
||||
-->
|
||||
<filter class="solr.KoreanPartOfSpeechStopFilterFactory" />
|
||||
<!-- Replaces term text with the Hangul transcription of Hanja characters, if applicable: -->
|
||||
<filter class="solr.KoreanReadingFormFilterFactory" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- Latvian -->
|
||||
<fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer>
|
||||
|
|
|
@ -322,6 +322,14 @@
|
|||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
<fieldType name="text_ko" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.KoreanTokenizerFactory" decompoundMode="discard" outputUnknownUnigrams="false"/>
|
||||
<filter class="solr.KoreanPartOfSpeechStopFilterFactory" />
|
||||
<filter class="solr.KoreanReadingFormFilterFactory" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
<fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
|
@ -470,6 +478,7 @@
|
|||
<dynamicField name="*_txt_id" type="text_id" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_txt_it" type="text_it" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_txt_ja" type="text_ja" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_txt_ko" type="text_ko" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_txt_lv" type="text_lv" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_txt_nl" type="text_nl" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_txt_no" type="text_no" indexed="true" stored="true"/>
|
||||
|
|
|
@ -849,6 +849,40 @@
|
|||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- Korean morphological analysis -->
|
||||
<dynamicField name="*_txt_ko" type="text_ko" indexed="true" stored="true"/>
|
||||
<fieldType name="text_ko" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<!-- Nori Korean morphological analyzer/tokenizer (KoreanTokenizer)
|
||||
The Korean (nori) analyzer integrates the Lucene nori analysis module into Solr.
|
||||
It uses the mecab-ko-dic dictionary to perform morphological analysis of Korean texts.
|
||||
|
||||
This dictionary was built with MeCab; it defines a format for the features adapted
|
||||
for the Korean language.
|
||||
|
||||
Nori also has a convenient user dictionary feature that allows overriding the statistical
|
||||
model with your own entries for segmentation, part-of-speech tags and readings without a need
|
||||
to specify weights. Notice that user dictionaries have not been subject to extensive testing.
|
||||
|
||||
The tokenizer supports multiple schema attributes:
|
||||
* userDictionary: User dictionary path.
|
||||
* userDictionaryEncoding: User dictionary encoding.
|
||||
* decompoundMode: Decompound mode. Either 'none', 'discard', 'mixed'. Default is 'discard'.
|
||||
* outputUnknownUnigrams: If true, outputs unigrams for unknown words.
|
||||
-->
|
||||
<tokenizer class="solr.KoreanTokenizerFactory" decompoundMode="discard" outputUnknownUnigrams="false"/>
|
||||
<!-- Removes tokens with certain part-of-speech tags, such as EOMI (Pos.E). You can add a parameter 'tags',
|
||||
listing the tags to remove. By default it removes:
|
||||
E, IC, J, MAG, MAJ, MM, SP, SSC, SSO, SC, SE, XPN, XSA, XSN, XSV, UNA, NA, VSV
|
||||
This is basically equivalent to stemming.
|
||||
-->
|
||||
<filter class="solr.KoreanPartOfSpeechStopFilterFactory" />
|
||||
<!-- Replaces term text with the Hangul transcription of Hanja characters, if applicable: -->
|
||||
<filter class="solr.KoreanReadingFormFilterFactory" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- Latvian -->
|
||||
<dynamicField name="*_txt_lv" type="text_lv" indexed="true" stored="true"/>
|
||||
<fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
|
||||
|
|
|
@ -1032,6 +1032,40 @@
|
|||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- Korean morphological analysis -->
|
||||
<dynamicField name="*_txt_ko" type="text_ko" indexed="true" stored="true"/>
|
||||
<fieldType name="text_ko" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<!-- Nori Korean morphological analyzer/tokenizer (KoreanTokenizer)
|
||||
The Korean (nori) analyzer integrates the Lucene nori analysis module into Solr.
|
||||
It uses the mecab-ko-dic dictionary to perform morphological analysis of Korean texts.
|
||||
|
||||
This dictionary was built with MeCab; it defines a format for the features adapted
|
||||
for the Korean language.
|
||||
|
||||
Nori also has a convenient user dictionary feature that allows overriding the statistical
|
||||
model with your own entries for segmentation, part-of-speech tags and readings without a need
|
||||
to specify weights. Notice that user dictionaries have not been subject to extensive testing.
|
||||
|
||||
The tokenizer supports multiple schema attributes:
|
||||
* userDictionary: User dictionary path.
|
||||
* userDictionaryEncoding: User dictionary encoding.
|
||||
* decompoundMode: Decompound mode. Either 'none', 'discard', 'mixed'. Default is 'discard'.
|
||||
* outputUnknownUnigrams: If true, outputs unigrams for unknown words.
|
||||
-->
|
||||
<tokenizer class="solr.KoreanTokenizerFactory" decompoundMode="discard" outputUnknownUnigrams="false"/>
|
||||
<!-- Removes tokens with certain part-of-speech tags, such as EOMI (Pos.E). You can add a parameter 'tags',
|
||||
listing the tags to remove. By default it removes:
|
||||
E, IC, J, MAG, MAJ, MM, SP, SSC, SSO, SC, SE, XPN, XSA, XSN, XSV, UNA, NA, VSV
|
||||
This is basically equivalent to stemming.
|
||||
-->
|
||||
<filter class="solr.KoreanPartOfSpeechStopFilterFactory" />
|
||||
<!-- Replaces term text with the Hangul transcription of Hanja characters, if applicable: -->
|
||||
<filter class="solr.KoreanReadingFormFilterFactory" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- Latvian -->
|
||||
<fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer>
|
||||
|
|
Loading…
Reference in New Issue