From c3305a50ff27affde4a1b846a172bb0b50873089 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 22 Mar 2012 12:21:48 +0000 Subject: [PATCH] add some more kuromoji javadocs git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303746 13f79535-47bb-0310-9956-ffa450edef68 --- .../analysis/kuromoji/KuromojiAnalyzer.java | 1 + .../analysis/kuromoji/KuromojiTokenizer.java | 58 ++++++++++++++++--- 2 files changed, 51 insertions(+), 8 deletions(-) diff --git a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java index 1689e410c96..e32ba592fb4 100644 --- a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java +++ b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java @@ -35,6 +35,7 @@ import org.apache.lucene.util.Version; /** * Analyzer for Japanese that uses morphological analysis. + * @see KuromojiTokenizer */ public class KuromojiAnalyzer extends StopwordAnalyzerBase { private final Mode mode; diff --git a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java index b720006a2a2..c3152837f25 100644 --- a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java +++ b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java @@ -47,23 +47,57 @@ import org.apache.lucene.util.fst.FST; // TODO: somehow factor out a reusable viterbi search here, // so other decompounders/tokenizers can reuse... -/* Uses a rolling Viterbi search to find the least cost - * segmentation (path) of the incoming characters. For - * tokens that appear to be compound (> length 2 for all +/** + * Tokenizer for Japanese that uses morphological analysis. + *

+ * This tokenizer sets a number of additional attributes: + *

+ *

+ * This tokenizer uses a rolling Viterbi search to find the + * least cost segmentation (path) of the incoming characters. + * For tokens that appear to be compound (> length 2 for all * Kanji, or > length 7 for non-Kanji), we see if there is a * 2nd best segmentation of that token after applying * penalties to the long tokens. If so, and the Mode is - * SEARCH_WITH_COMPOUND, we output the alternate - * segmentation as well. */ -/** - * Tokenizer for Japanese that uses morphological analysis. + * {@link Mode#SEARCH}, we output the alternate segmentation + * as well. */ public final class KuromojiTokenizer extends Tokenizer { + /** + * Tokenization mode: this determines how the tokenizer handles + * compound and unknown words. + */ public static enum Mode { - NORMAL, SEARCH, EXTENDED + /** + * Ordinary segmentation: no decomposition for compounds. + */ + NORMAL, + + /** + * Segmentation geared towards search: this includes a + * decompounding process for long nouns, also including + * the full compound token as a synonym. + */ + SEARCH, + + /** + * Extended mode outputs unigrams for unknown words. + * @lucene.experimental + */ + EXTENDED } + /** + * Default tokenization mode. Currently this is {@link Mode#SEARCH}. + */ public static final Mode DEFAULT_MODE = Mode.SEARCH; enum Type { @@ -139,6 +173,14 @@ public final class KuromojiTokenizer extends Tokenizer { private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class); private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class); + /** + * Create a new KuromojiTokenizer. + * + * @param input Reader containing text. + * @param userDictionary Optional: if non-null, user dictionary. + * @param discardPunctuation true if punctuation tokens should be dropped from the output. + * @param mode tokenization mode.
+ */ public KuromojiTokenizer(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) { super(input); dictionary = TokenInfoDictionary.getInstance();