From c3305a50ff27affde4a1b846a172bb0b50873089 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Thu, 22 Mar 2012 12:21:48 +0000
Subject: [PATCH] add some more kuromoji javadocs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303746 13f79535-47bb-0310-9956-ffa450edef68
---
 .../analysis/kuromoji/KuromojiAnalyzer.java   |  1 +
 .../analysis/kuromoji/KuromojiTokenizer.java  | 58 ++++++++++++++++---
 2 files changed, 51 insertions(+), 8 deletions(-)
diff --git a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
index 1689e410c96..e32ba592fb4 100644
--- a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
+++ b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
@@ -35,6 +35,7 @@ import org.apache.lucene.util.Version;
 
 /**
  * Analyzer for Japanese that uses morphological analysis.
+ * @see KuromojiTokenizer
  */
 public class KuromojiAnalyzer extends StopwordAnalyzerBase {
   private final Mode mode;
diff --git a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
index b720006a2a2..c3152837f25 100644
--- a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
+++ b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
@@ -47,23 +47,57 @@ import org.apache.lucene.util.fst.FST;
 // TODO: somehow factor out a reusable viterbi search here,
 // so other decompounders/tokenizers can reuse...
 
-/* Uses a rolling Viterbi search to find the least cost
- * segmentation (path) of the incoming characters.  For
- * tokens that appear to be compound (> length 2 for all
+/**
+ * Tokenizer for Japanese that uses morphological analysis.
+ * <p>
+ * This tokenizer sets a number of additional attributes:
+ * <ul>
+ *   <li>{@link BaseFormAttribute} containing base form for inflected
+ *       adjectives and verbs.
+ *   <li>{@link PartOfSpeechAttribute} containing part-of-speech.
+ *   <li>{@link ReadingAttribute} containing reading and pronunciation.
+ *   <li>{@link InflectionAttribute} containing additional part-of-speech
+ *       information for inflected forms.
+ * </ul>
+ * <p>
+ * This tokenizer uses a rolling Viterbi search to find the 
+ * least cost segmentation (path) of the incoming characters.  
+ * For tokens that appear to be compound (> length 2 for all
  * Kanji, or > length 7 for non-Kanji), we see if there is a
  * 2nd best segmentation of that token after applying
  * penalties to the long tokens.  If so, and the Mode is
- * SEARCH_WITH_COMPOUND, we output the alternate
- * segmentation as well. */
-/**
- * Tokenizer for Japanese that uses morphological analysis.
+ * {@link Mode#SEARCH}, we output the alternate segmentation 
+ * as well.
  */
 public final class KuromojiTokenizer extends Tokenizer {
 
+  /**
+   * Tokenization mode: this determines how the tokenizer handles
+   * compound and unknown words.
+   */
   public static enum Mode {
-    NORMAL, SEARCH, EXTENDED
+    /**
+     * Ordinary segmentation: no decomposition for compounds,
+     */
+    NORMAL, 
+
+    /**
+     * Segmentation geared towards search: this includes a 
+     * decompounding process for long nouns, also including
+     * the full compound token as a synonym.
+     */
+    SEARCH, 
+
+    /**
+     * Extended mode outputs unigrams for unknown words.
+     * @lucene.experimental
+     */
+    EXTENDED
   }
 
+  /**
+   * Default tokenization mode. Currently this is {@link Mode#SEARCH}.
+   */
   public static final Mode DEFAULT_MODE = Mode.SEARCH;
 
   enum Type {
@@ -139,6 +173,14 @@ public final class KuromojiTokenizer extends Tokenizer {
   private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
   private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class);
 
+  /**
+   * Create a new KuromojiTokenizer.
+   * 
+   * @param input Reader containing text
+   * @param userDictionary Optional: if non-null, user dictionary.
+   * @param discardPunctuation true if punctuation tokens should be dropped from the output.
+   * @param mode tokenization mode.
+   */
   public KuromojiTokenizer(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
     super(input);
     dictionary = TokenInfoDictionary.getInstance();