Merge branch 'jira/lucene-8453' of https://github.com/mocobeta/lucene-solr-mirror

LUCENE-8453: Add documentation to analysis factories of Korean (Nori) analyzer module. This closes #434
2025-02-09 03:25:15 +00:00 · 2018-08-11 12:50:19 +02:00 · 2018-08-11 12:50:19 +02:00 · e9addea087
commit e9addea087
parent cdc0959afc f64f243ef0
4 changed files with 49 additions and 1 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -254,6 +254,9 @@ Other:
 * LUCENE-8420: Upgrade OpenNLP to 1.9.0 so OpenNLP tool can read the new model format which 1.8.x
  cannot read. 1.9.0 can read the old format. (Koji Sekiguchi)

+* LUCENE-8453: Add documentation to analysis factories of Korean (Nori) analyzer
+  module.  (Tomoko Uchida via Uwe Schindler)
+
 ======================= Lucene 7.4.1 =======================

 Bug Fixes:
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanPartOfSpeechStopFilterFactory.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanPartOfSpeechStopFilterFactory.java
@@ -25,12 +25,27 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;

 /**
 * Factory for {@link KoreanPartOfSpeechStopFilter}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_ko" class="solr.TextField"&gt;
+ *    &lt;analyzer&gt;
+ *      &lt;tokenizer class="solr.KoreanTokenizerFactory"/&gt;
+ *      &lt;filter class="solr.KoreanPartOfSpeechStopFilterFactory"
+ *              tags="E,J"/&gt;
+ *    &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;
+ * </pre>
+ *
+ * <p>
+ * Supports the following attributes:
+ * <ul>
+ *   <li>tags: List of stop tags. If not specified, {@link KoreanPartOfSpeechStopFilter#DEFAULT_STOP_TAGS} is used.</li>
+ * </ul>
 * @lucene.experimental
 */
 public class KoreanPartOfSpeechStopFilterFactory extends TokenFilterFactory {
  private Set<POS.Tag> stopTags;

-  /** Creates a new JapanesePartOfSpeechStopFilterFactory */
+  /** Creates a new KoreanPartOfSpeechStopFilterFactory */
  public KoreanPartOfSpeechStopFilterFactory(Map<String,String> args) {
    super(args);
    Set<String> stopTagStr = getSet(args, "tags");
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanReadingFormFilterFactory.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanReadingFormFilterFactory.java
@@ -23,6 +23,14 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;

 /**
 * Factory for {@link KoreanReadingFormFilter}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_ko" class="solr.TextField"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.KoreanTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.KoreanReadingFormFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;
+ * </pre>
 * @lucene.experimental
 */
 public class KoreanReadingFormFilterFactory extends TokenFilterFactory {
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizerFactory.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizerFactory.java
@@ -32,9 +32,31 @@ import org.apache.lucene.util.AttributeFactory;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.analysis.util.ResourceLoader;
 import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.ko.KoreanTokenizer.DecompoundMode;

 /**
 * Factory for {@link KoreanTokenizer}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_ko" class="solr.TextField"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.KoreanTokenizerFactory"
+ *                decompoundMode="discard"
+ *                userDictionary="user.txt"
+ *                userDictionaryEncoding="UTF-8"
+ *                outputUnknownUnigrams="false"
+ *     /&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;
+ * </pre>
+ *
+ * <p>
+ * Supports the following attributes:
+ * <ul>
+ *   <li>userDictionary: User dictionary path.</li>
+ *   <li>userDictionaryEncoding: User dictionary encoding.</li>
+ *   <li>decompoundMode: Decompound mode. One of 'none', 'discard', or 'mixed'. Default is 'discard'. See {@link DecompoundMode}.</li>
+ *   <li>outputUnknownUnigrams: If true, outputs unigrams for unknown words.</li>
+ * </ul>
 * @lucene.experimental
 */
 public class KoreanTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware {