mirror of https://github.com/apache/lucene.git
add some more kuromoji javadocs
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303746 13f79535-47bb-0310-9956-ffa450edef68
commit c3305a50ff
parent d2eebf9330
KuromojiAnalyzer.java
@@ -35,6 +35,7 @@ import org.apache.lucene.util.Version;
 
 /**
  * Analyzer for Japanese that uses morphological analysis.
+ * @see KuromojiTokenizer
  */
 public class KuromojiAnalyzer extends StopwordAnalyzerBase {
   private final Mode mode;
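For context, a minimal sketch of how the analyzer documented above can be exercised. Only the class names come from this diff; the Version-only constructor, the KuromojiAnalyzerExample class name, and the TokenStream consumption pattern are assumptions based on the Lucene trunk analysis API of this period, not part of the commit.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.kuromoji.KuromojiAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class KuromojiAnalyzerExample {
  public static void main(String[] args) throws IOException {
    // Assumed: a Version-only constructor that uses the default stopwords,
    // stoptags and tokenization mode.
    KuromojiAnalyzer analyzer = new KuromojiAnalyzer(Version.LUCENE_40);

    // Standard TokenStream workflow: reset, incrementToken loop, end, close.
    TokenStream ts = analyzer.tokenStream("field", new StringReader("日本語の形態素解析"));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(termAtt.toString());
    }
    ts.end();
    ts.close();
  }
}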
KuromojiTokenizer.java
@@ -47,23 +47,57 @@ import org.apache.lucene.util.fst.FST;
 // TODO: somehow factor out a reusable viterbi search here,
 // so other decompounders/tokenizers can reuse...
 
-/* Uses a rolling Viterbi search to find the least cost
- * segmentation (path) of the incoming characters. For
- * tokens that appear to be compound (> length 2 for all
+/**
+ * Tokenizer for Japanese that uses morphological analysis.
+ * <p>
+ * This tokenizer sets a number of additional attributes:
+ * <ul>
+ * <li>{@link BaseFormAttribute} containing base form for inflected
+ * adjectives and verbs.
+ * <li>{@link PartOfSpeechAttribute} containing part-of-speech.
+ * <li>{@link ReadingAttribute} containing reading and pronunciation.
+ * <li>{@link InflectionAttribute} containing additional part-of-speech
+ * information for inflected forms.
+ * </ul>
+ * <p>
+ * This tokenizer uses a rolling Viterbi search to find the
+ * least cost segmentation (path) of the incoming characters.
+ * For tokens that appear to be compound (> length 2 for all
  * Kanji, or > length 7 for non-Kanji), we see if there is a
  * 2nd best segmentation of that token after applying
  * penalties to the long tokens. If so, and the Mode is
- * SEARCH_WITH_COMPOUND, we output the alternate
- * segmentation as well. */
-/**
- * Tokenizer for Japanese that uses morphological analysis.
+ * {@link Mode#SEARCH}, we output the alternate segmentation
+ * as well.
  */
 public final class KuromojiTokenizer extends Tokenizer {
 
+  /**
+   * Tokenization mode: this determines how the tokenizer handles
+   * compound and unknown words.
+   */
   public static enum Mode {
-    NORMAL, SEARCH, EXTENDED
+    /**
+     * Ordinary segmentation: no decomposition for compounds,
+     */
+    NORMAL,
+
+    /**
+     * Segmentation geared towards search: this includes a
+     * decompounding process for long nouns, also including
+     * the full compound token as a synonym.
+     */
+    SEARCH,
+
+    /**
+     * Extended mode outputs unigrams for unknown words.
+     * @lucene.experimental
+     */
+    EXTENDED
   }
 
+  /**
+   * Default tokenization mode. Currently this is {@link Mode#SEARCH}.
+   */
   public static final Mode DEFAULT_MODE = Mode.SEARCH;
 
   enum Type {
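A sketch of consuming KuromojiTokenizer directly and reading the attributes listed in the new javadoc. The constructor arguments mirror the signature shown in this commit; the tokenattributes package path and the getter names (getBaseForm, getPartOfSpeech, getReading, getInflectionType) are assumptions about the surrounding Kuromoji API rather than part of this diff.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
import org.apache.lucene.analysis.kuromoji.tokenattributes.BaseFormAttribute;
import org.apache.lucene.analysis.kuromoji.tokenattributes.InflectionAttribute;
import org.apache.lucene.analysis.kuromoji.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.kuromoji.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class KuromojiTokenizerExample {
  public static void main(String[] args) throws IOException {
    // Constructor signature as documented above:
    // (Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode).
    // The user dictionary is optional, so null is passed here.
    KuromojiTokenizer tokenizer = new KuromojiTokenizer(
        new StringReader("お寿司が食べたい。"), null, true, KuromojiTokenizer.DEFAULT_MODE);

    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    BaseFormAttribute baseAtt = tokenizer.addAttribute(BaseFormAttribute.class);
    PartOfSpeechAttribute posAtt = tokenizer.addAttribute(PartOfSpeechAttribute.class);
    ReadingAttribute readingAtt = tokenizer.addAttribute(ReadingAttribute.class);
    InflectionAttribute inflAtt = tokenizer.addAttribute(InflectionAttribute.class);

    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      // Base form and inflection data are only set for inflected adjectives
      // and verbs; for other tokens the getters return null.
      System.out.println(termAtt
          + " base=" + baseAtt.getBaseForm()
          + " pos=" + posAtt.getPartOfSpeech()
          + " reading=" + readingAtt.getReading()
          + " inflection=" + inflAtt.getInflectionType());
    }
    tokenizer.end();
    tokenizer.close();
  }
}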
KuromojiTokenizer.java
@@ -139,6 +173,14 @@ public final class KuromojiTokenizer extends Tokenizer {
   private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
   private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class);
 
+  /**
+   * Create a new KuromojiTokenizer.
+   *
+   * @param input Reader containing text
+   * @param userDictionary Optional: if non-null, user dictionary.
+   * @param discardPunctuation true if punctuation tokens should be dropped from the output.
+   * @param mode tokenization mode.
+   */
   public KuromojiTokenizer(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
     super(input);
     dictionary = TokenInfoDictionary.getInstance();
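To illustrate the documented constructor parameters and the Mode choices, a hypothetical helper that runs the same text through different modes. The ModeComparison class, the dump method, and the sample input are illustrative assumptions; only the constructor arguments and the "full compound token as a synonym" behavior come from the javadoc added in this commit.

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class ModeComparison {

  // Hypothetical helper: tokenizes the text with the given mode and prints one
  // line per token. discardPunctuation=true drops punctuation tokens, as the
  // constructor javadoc describes.
  static void dump(String text, Mode mode) throws IOException {
    Reader reader = new StringReader(text);
    KuromojiTokenizer tokenizer = new KuromojiTokenizer(reader, null, true, mode);
    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = tokenizer.addAttribute(PositionIncrementAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      // In SEARCH mode the full compound is also emitted as a synonym of its
      // decompounded parts, typically visible as a position increment of 0.
      System.out.println(mode + "\t" + termAtt + "\tposInc=" + posIncAtt.getPositionIncrement());
    }
    tokenizer.end();
    tokenizer.close();
  }

  public static void main(String[] args) throws IOException {
    String text = "関西国際空港";     // a long compound noun
    dump(text, Mode.NORMAL);         // no decomposition of compounds
    dump(text, Mode.SEARCH);         // decompounds long nouns, keeps the compound as a synonym
  }
}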