mirror of https://github.com/apache/lucene.git
add some more kuromoji javadocs
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303746 13f79535-47bb-0310-9956-ffa450edef68
commit c3305a50ff
parent d2eebf9330
KuromojiAnalyzer.java
@@ -35,6 +35,7 @@ import org.apache.lucene.util.Version;
 
 /**
  * Analyzer for Japanese that uses morphological analysis.
+ * @see KuromojiTokenizer
  */
 public class KuromojiAnalyzer extends StopwordAnalyzerBase {
   private final Mode mode;
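For context, a minimal sketch of how the analyzer documented above can be exercised. Only the class names come from this diff; the Version-only constructor, the KuromojiAnalyzerExample class name, and the TokenStream consumption pattern are assumptions based on the Lucene trunk analysis API of this period, not part of the commit.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.kuromoji.KuromojiAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class KuromojiAnalyzerExample {
  public static void main(String[] args) throws IOException {
    // Assumed: a Version-only constructor that uses the default stopwords,
    // stoptags and tokenization mode.
    KuromojiAnalyzer analyzer = new KuromojiAnalyzer(Version.LUCENE_40);

    // Standard TokenStream workflow: reset, incrementToken loop, end, close.
    TokenStream ts = analyzer.tokenStream("field", new StringReader("日本語の形態素解析"));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(termAtt.toString());
    }
    ts.end();
    ts.close();
  }
}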
KuromojiTokenizer.java
@@ -47,23 +47,57 @@ import org.apache.lucene.util.fst.FST;
 // TODO: somehow factor out a reusable viterbi search here,
 // so other decompounders/tokenizers can reuse...
 
-/* Uses a rolling Viterbi search to find the least cost
- * segmentation (path) of the incoming characters. For
- * tokens that appear to be compound (> length 2 for all
+/**
+ * Tokenizer for Japanese that uses morphological analysis.
+ * <p>
+ * This tokenizer sets a number of additional attributes:
+ * <ul>
+ * <li>{@link BaseFormAttribute} containing base form for inflected
+ * adjectives and verbs.
+ * <li>{@link PartOfSpeechAttribute} containing part-of-speech.
+ * <li>{@link ReadingAttribute} containing reading and pronunciation.
+ * <li>{@link InflectionAttribute} containing additional part-of-speech
+ * information for inflected forms.
+ * </ul>
+ * <p>
+ * This tokenizer uses a rolling Viterbi search to find the
+ * least cost segmentation (path) of the incoming characters.
+ * For tokens that appear to be compound (> length 2 for all
  * Kanji, or > length 7 for non-Kanji), we see if there is a
  * 2nd best segmentation of that token after applying
  * penalties to the long tokens. If so, and the Mode is
- * SEARCH_WITH_COMPOUND, we output the alternate
- * segmentation as well. */
-/**
- * Tokenizer for Japanese that uses morphological analysis.
+ * {@link Mode#SEARCH}, we output the alternate segmentation
+ * as well.
  */
 public final class KuromojiTokenizer extends Tokenizer {
 
+  /**
+   * Tokenization mode: this determines how the tokenizer handles
+   * compound and unknown words.
+   */
   public static enum Mode {
-    NORMAL, SEARCH, EXTENDED
+    /**
+     * Ordinary segmentation: no decomposition for compounds,
+     */
+    NORMAL,
+
+    /**
+     * Segmentation geared towards search: this includes a
+     * decompounding process for long nouns, also including
+     * the full compound token as a synonym.
+     */
+    SEARCH,
+
+    /**
+     * Extended mode outputs unigrams for unknown words.
+     * @lucene.experimental
+     */
+    EXTENDED
   }
 
+  /**
+   * Default tokenization mode. Currently this is {@link Mode#SEARCH}.
+   */
   public static final Mode DEFAULT_MODE = Mode.SEARCH;
 
   enum Type {
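A sketch of consuming KuromojiTokenizer directly and reading the attributes listed in the new javadoc. The constructor arguments mirror the signature shown in this commit; the tokenattributes package path and the getter names (getBaseForm, getPartOfSpeech, getReading, getInflectionType) are assumptions about the surrounding Kuromoji API rather than part of this diff.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
import org.apache.lucene.analysis.kuromoji.tokenattributes.BaseFormAttribute;
import org.apache.lucene.analysis.kuromoji.tokenattributes.InflectionAttribute;
import org.apache.lucene.analysis.kuromoji.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.kuromoji.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class KuromojiTokenizerExample {
  public static void main(String[] args) throws IOException {
    // Constructor signature as documented above:
    // (Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode).
    // The user dictionary is optional, so null is passed here.
    KuromojiTokenizer tokenizer = new KuromojiTokenizer(
        new StringReader("お寿司が食べたい。"), null, true, KuromojiTokenizer.DEFAULT_MODE);

    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    BaseFormAttribute baseAtt = tokenizer.addAttribute(BaseFormAttribute.class);
    PartOfSpeechAttribute posAtt = tokenizer.addAttribute(PartOfSpeechAttribute.class);
    ReadingAttribute readingAtt = tokenizer.addAttribute(ReadingAttribute.class);
    InflectionAttribute inflAtt = tokenizer.addAttribute(InflectionAttribute.class);

    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      // Base form and inflection data are only set for inflected adjectives
      // and verbs; for other tokens the getters return null.
      System.out.println(termAtt
          + " base=" + baseAtt.getBaseForm()
          + " pos=" + posAtt.getPartOfSpeech()
          + " reading=" + readingAtt.getReading()
          + " inflection=" + inflAtt.getInflectionType());
    }
    tokenizer.end();
    tokenizer.close();
  }
}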
KuromojiTokenizer.java
@@ -139,6 +173,14 @@ public final class KuromojiTokenizer extends Tokenizer {
   private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
   private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class);
 
+  /**
+   * Create a new KuromojiTokenizer.
+   *
+   * @param input Reader containing text
+   * @param userDictionary Optional: if non-null, user dictionary.
+   * @param discardPunctuation true if punctuation tokens should be dropped from the output.
+   * @param mode tokenization mode.
+   */
   public KuromojiTokenizer(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
     super(input);
     dictionary = TokenInfoDictionary.getInstance();
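To illustrate the documented constructor parameters and the Mode choices, a hypothetical helper that runs the same text through different modes. The ModeComparison class, the dump method, and the sample input are illustrative assumptions; only the constructor arguments and the "full compound token as a synonym" behavior come from the javadoc added in this commit.

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class ModeComparison {

  // Hypothetical helper: tokenizes the text with the given mode and prints one
  // line per token. discardPunctuation=true drops punctuation tokens, as the
  // constructor javadoc describes.
  static void dump(String text, Mode mode) throws IOException {
    Reader reader = new StringReader(text);
    KuromojiTokenizer tokenizer = new KuromojiTokenizer(reader, null, true, mode);
    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = tokenizer.addAttribute(PositionIncrementAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      // In SEARCH mode the full compound is also emitted as a synonym of its
      // decompounded parts, typically visible as a position increment of 0.
      System.out.println(mode + "\t" + termAtt + "\tposInc=" + posIncAtt.getPositionIncrement());
    }
    tokenizer.end();
    tokenizer.close();
  }

  public static void main(String[] args) throws IOException {
    String text = "関西国際空港";     // a long compound noun
    dump(text, Mode.NORMAL);         // no decomposition of compounds
    dump(text, Mode.SEARCH);         // decompounds long nouns, keeps the compound as a synonym
  }
}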