mirror of
https://github.com/apache/lucene.git
synced 2025-02-09 03:25:15 +00:00
Merge branch 'jira/lucene-8453' of https://github.com/mocobeta/lucene-solr-mirror
LUCENE-8453: Add documentation to analysis factories of Korean (Nori) analyzer module This closes #434
This commit is contained in:
commit
e9addea087
@ -254,6 +254,9 @@ Other:
|
||||
* LUCENE-8420: Upgrade OpenNLP to 1.9.0 so OpenNLP tool can read the new model format which 1.8.x
|
||||
cannot read. 1.9.0 can read the old format. (Koji Sekiguchi)
|
||||
|
||||
* LUCENE-8453: Add documentation to analysis factories of Korean (Nori) analyzer
|
||||
module. (Tomoko Uchida via Uwe Schindler)
|
||||
|
||||
======================= Lucene 7.4.1 =======================
|
||||
|
||||
Bug Fixes:
|
||||
|
@ -25,12 +25,27 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link KoreanPartOfSpeechStopFilter}.
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_ko" class="solr.TextField">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.KoreanTokenizerFactory"/>
|
||||
* <filter class="solr.KoreanPartOfSpeechStopFilterFactory"
|
||||
* tags="E,J"/>
|
||||
* </analyzer>
|
||||
* </fieldType>
|
||||
* </pre>
|
||||
*
|
||||
* <p>
|
||||
* Supports the following attributes:
|
||||
* <ul>
|
||||
* <li>tags: List of stop tags. If not specified, {@link KoreanPartOfSpeechStopFilter#DEFAULT_STOP_TAGS} is used.</li>
|
||||
* </ul>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class KoreanPartOfSpeechStopFilterFactory extends TokenFilterFactory {
|
||||
private Set<POS.Tag> stopTags;
|
||||
|
||||
/** Creates a new JapanesePartOfSpeechStopFilterFactory */
|
||||
/** Creates a new KoreanPartOfSpeechStopFilterFactory */
|
||||
public KoreanPartOfSpeechStopFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
Set<String> stopTagStr = getSet(args, "tags");
|
||||
|
@ -23,6 +23,14 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link KoreanReadingFormFilter}.
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_ko" class="solr.TextField">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.KoreanTokenizerFactory"/>
|
||||
* <filter class="solr.KoreanReadingFormFilterFactory"/>
|
||||
* </analyzer>
|
||||
* </fieldType>
|
||||
* </pre>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class KoreanReadingFormFilterFactory extends TokenFilterFactory {
|
||||
|
@ -32,9 +32,31 @@ import org.apache.lucene.util.AttributeFactory;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.analysis.util.ResourceLoader;
|
||||
import org.apache.lucene.analysis.util.ResourceLoaderAware;
|
||||
import org.apache.lucene.analysis.ko.KoreanTokenizer.DecompoundMode;
|
||||
|
||||
/**
|
||||
* Factory for {@link KoreanTokenizer}.
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_ko" class="solr.TextField">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.KoreanTokenizerFactory"
|
||||
* decompoundMode="discard"
|
||||
* userDictionary="user.txt"
|
||||
* userDictionaryEncoding="UTF-8"
|
||||
* outputUnknownUnigrams="false"
|
||||
* />
|
||||
* </analyzer>
|
||||
* </fieldType>
|
||||
* </pre>
|
||||
*
|
||||
* <p>
|
||||
* Supports the following attributes:
|
||||
* <ul>
|
||||
* <li>userDictionary: User dictionary path.</li>
|
||||
* <li>userDictionaryEncoding: User dictionary encoding.</li>
|
||||
* <li>decompoundMode: Decompound mode. One of 'none', 'discard', or 'mixed'. Default is 'discard'. See {@link DecompoundMode}.</li>
|
||||
* <li>outputUnknownUnigrams: If true, outputs unigrams for unknown words.</li>
|
||||
* </ul>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class KoreanTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware {
|
||||
|
Loading…
x
Reference in New Issue
Block a user