add some more kuromoji javadocs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303746 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-03-22 12:21:48 +00:00
parent d2eebf9330
commit c3305a50ff
2 changed files with 51 additions and 8 deletions

KuromojiAnalyzer.java

@@ -35,6 +35,7 @@ import org.apache.lucene.util.Version;
 
 /**
  * Analyzer for Japanese that uses morphological analysis.
+ * @see KuromojiTokenizer
  */
 public class KuromojiAnalyzer extends StopwordAnalyzerBase {
   private final Mode mode;
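
For context, a minimal usage sketch of the analyzer (not part of this commit; it assumes the 3.6/4.0-era Lucene analysis API, and the Version constant is a placeholder for whatever release you build against):

    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.kuromoji.KuromojiAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class KuromojiAnalyzerExample {
      public static void main(String[] args) throws Exception {
        // Japanese has no whitespace between words; the analyzer relies on
        // morphological analysis (via KuromojiTokenizer) to segment the text.
        Analyzer analyzer = new KuromojiAnalyzer(Version.LUCENE_36);
        TokenStream ts = analyzer.tokenStream("body", new StringReader("関西国際空港に行った"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(termAtt.toString());
        }
        ts.end();
        ts.close();
      }
    }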

KuromojiTokenizer.java

@@ -47,23 +47,57 @@ import org.apache.lucene.util.fst.FST;
 
 // TODO: somehow factor out a reusable viterbi search here,
 // so other decompounders/tokenizers can reuse...
 
-/* Uses a rolling Viterbi search to find the least cost
- * segmentation (path) of the incoming characters. For
- * tokens that appear to be compound (> length 2 for all
+/**
+ * Tokenizer for Japanese that uses morphological analysis.
+ * <p>
+ * This tokenizer sets a number of additional attributes:
+ * <ul>
+ *   <li>{@link BaseFormAttribute} containing base form for inflected
+ *       adjectives and verbs.
+ *   <li>{@link PartOfSpeechAttribute} containing part-of-speech.
+ *   <li>{@link ReadingAttribute} containing reading and pronunciation.
+ *   <li>{@link InflectionAttribute} containing additional part-of-speech
+ *       information for inflected forms.
+ * </ul>
+ * <p>
+ * This tokenizer uses a rolling Viterbi search to find the
+ * least cost segmentation (path) of the incoming characters.
+ * For tokens that appear to be compound (> length 2 for all
  * Kanji, or > length 7 for non-Kanji), we see if there is a
  * 2nd best segmentation of that token after applying
  * penalties to the long tokens. If so, and the Mode is
- * SEARCH_WITH_COMPOUND, we output the alternate
- * segmentation as well. */
-/**
- * Tokenizer for Japanese that uses morphological analysis.
+ * {@link Mode#SEARCH}, we output the alternate segmentation
+ * as well.
  */
 public final class KuromojiTokenizer extends Tokenizer {
 
+  /**
+   * Tokenization mode: this determines how the tokenizer handles
+   * compound and unknown words.
+   */
   public static enum Mode {
-    NORMAL, SEARCH, EXTENDED
+    /**
+     * Ordinary segmentation: no decomposition for compounds.
+     */
+    NORMAL,
+
+    /**
+     * Segmentation geared towards search: this includes a
+     * decompounding process for long nouns, also including
+     * the full compound token as a synonym.
+     */
+    SEARCH,
+
+    /**
+     * Extended mode outputs unigrams for unknown words.
+     * @lucene.experimental
+     */
+    EXTENDED
   }
 
+  /**
+   * Default tokenization mode. Currently this is {@link Mode#SEARCH}.
+   */
   public static final Mode DEFAULT_MODE = Mode.SEARCH;
 
   enum Type {
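
The practical difference between the modes shows up on long compounds. Below is a rough sketch (not part of this commit) of what SEARCH mode produces for a long kanji compound; the exact tokens and their order are an assumption based on the javadocs above, not verified output:

    import java.io.StringReader;
    import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
    import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

    public class SearchModeExample {
      public static void main(String[] args) throws Exception {
        // 関西国際空港 ("Kansai International Airport") is a kanji compound of
        // length > 2, so SEARCH mode decompounds it into 関西 / 国際 / 空港 and
        // also emits the full compound as a synonym.
        KuromojiTokenizer tok = new KuromojiTokenizer(
            new StringReader("関西国際空港"), null, true, Mode.SEARCH);
        CharTermAttribute termAtt = tok.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute incAtt = tok.addAttribute(PositionIncrementAttribute.class);
        tok.reset();
        while (tok.incrementToken()) {
          // the synonym (whole-compound) token is stacked at the same
          // position as a decompounded part, i.e. position increment 0
          System.out.println(termAtt + " (posInc=" + incAtt.getPositionIncrement() + ")");
        }
        tok.end();
        tok.close();
      }
    }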
@@ -139,6 +173,14 @@ public final class KuromojiTokenizer extends Tokenizer {
   private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
   private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class);
 
+  /**
+   * Create a new KuromojiTokenizer.
+   *
+   * @param input Reader containing text
+   * @param userDictionary Optional: if non-null, user dictionary.
+   * @param discardPunctuation true if punctuation tokens should be dropped from the output.
+   * @param mode tokenization mode.
+   */
   public KuromojiTokenizer(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
     super(input);
     dictionary = TokenInfoDictionary.getInstance();
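
Finally, a usage sketch for the constructor documented above, reading the extra attributes off the stream (again not part of this commit; the tokenattributes package and getter names are taken from the Kuromoji module and should be treated as assumptions):

    import java.io.StringReader;
    import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
    import org.apache.lucene.analysis.kuromoji.tokenattributes.BaseFormAttribute;
    import org.apache.lucene.analysis.kuromoji.tokenattributes.PartOfSpeechAttribute;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class KuromojiTokenizerExample {
      public static void main(String[] args) throws Exception {
        // null user dictionary, punctuation discarded, default (SEARCH) mode
        KuromojiTokenizer tok = new KuromojiTokenizer(
            new StringReader("猫を飼っています"), null, true, KuromojiTokenizer.DEFAULT_MODE);
        CharTermAttribute termAtt = tok.addAttribute(CharTermAttribute.class);
        PartOfSpeechAttribute posAtt = tok.addAttribute(PartOfSpeechAttribute.class);
        BaseFormAttribute baseAtt = tok.addAttribute(BaseFormAttribute.class);
        tok.reset();
        while (tok.incrementToken()) {
          // getBaseForm() returns null for uninflected tokens; for an inflected
          // verb such as 飼っ the base form would be 飼う
          System.out.println(termAtt + "\t" + posAtt.getPartOfSpeech() + "\t" + baseAtt.getBaseForm());
        }
        tok.end();
        tok.close();
      }
    }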