LUCENE-8453: Add documentation to analysis factories of Korean (Nori) analyzer module
This closes #434
This commit is contained in:
Uwe Schindler 2018-08-11 12:50:19 +02:00
commit e9addea087
4 changed files with 49 additions and 1 deletions

View File

@ -254,6 +254,9 @@ Other:
* LUCENE-8420: Upgrade OpenNLP to 1.9.0 so OpenNLP tool can read the new model format which 1.8.x
cannot read. 1.9.0 can read the old format. (Koji Sekiguchi)
* LUCENE-8453: Add documentation to analysis factories of Korean (Nori) analyzer
module. (Tomoko Uchida via Uwe Schindler)
======================= Lucene 7.4.1 =======================
Bug Fixes:

View File

@ -25,12 +25,27 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link KoreanPartOfSpeechStopFilter}.
* <pre class="prettyprint">
* &lt;fieldType name="text_ko" class="solr.TextField"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.KoreanTokenizerFactory"/&gt;
* &lt;filter class="solr.KoreanPartOfSpeechStopFilterFactory"
* tags="E,J"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;
* </pre>
*
* <p>
* Supports the following attributes:
* <ul>
* <li>tags: List of stop tags. If not specified, {@link KoreanPartOfSpeechStopFilter#DEFAULT_STOP_TAGS} is used.</li>
* </ul>
* @lucene.experimental
*/
public class KoreanPartOfSpeechStopFilterFactory extends TokenFilterFactory {
private Set<POS.Tag> stopTags;
/** Creates a new JapanesePartOfSpeechStopFilterFactory */
/** Creates a new KoreanPartOfSpeechStopFilterFactory */
public KoreanPartOfSpeechStopFilterFactory(Map<String,String> args) {
super(args);
Set<String> stopTagStr = getSet(args, "tags");

View File

@ -23,6 +23,14 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link KoreanReadingFormFilter}.
* <pre class="prettyprint">
* &lt;fieldType name="text_ko" class="solr.TextField"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.KoreanTokenizerFactory"/&gt;
* &lt;filter class="solr.KoreanReadingFormFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;
* </pre>
* @lucene.experimental
*/
public class KoreanReadingFormFilterFactory extends TokenFilterFactory {

View File

@ -32,9 +32,31 @@ import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.ko.KoreanTokenizer.DecompoundMode;
/**
* Factory for {@link KoreanTokenizer}.
* <pre class="prettyprint">
* &lt;fieldType name="text_ko" class="solr.TextField"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.KoreanTokenizerFactory"
* decompoundMode="discard"
* userDictionary="user.txt"
* userDictionaryEncoding="UTF-8"
* outputUnknownUnigrams="false"
* /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;
* </pre>
*
* <p>
* Supports the following attributes:
* <ul>
* <li>userDictionary: User dictionary path.</li>
* <li>userDictionaryEncoding: User dictionary encoding.</li>
* <li>decompoundMode: Decompound mode. One of 'none', 'discard', or 'mixed'. Default is 'discard'. See {@link DecompoundMode}.</li>
* <li>outputUnknownUnigrams: If true, outputs unigrams for unknown words.</li>
* </ul>
* @lucene.experimental
*/
public class KoreanTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware {