mirror of
https://github.com/apache/lucene.git
synced 2025-02-08 19:15:06 +00:00
LUCENE-8981: update Kuromoji javadocs, adding experimental tags to DictionaryBuilder and JapaneseTokenizer ctor
This commit is contained in:
parent
b617769614
commit
93d3e5d666
@ -219,9 +219,9 @@ public final class JapaneseTokenizer extends Tokenizer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a new JapaneseTokenizer, supplying a custom system dictionary and unknown dictionary.
|
* <p>Create a new JapaneseTokenizer, supplying a custom system dictionary and unknown dictionary.
|
||||||
* <p>
|
* This constructor provides an entry point for users that want to construct custom language models
|
||||||
* Uses the default AttributeFactory.
|
* that can be used as input to {@link org.apache.lucene.analysis.ja.util.DictionaryBuilder}.</p>
|
||||||
*
|
*
|
||||||
* @param factory the AttributeFactory to use
|
* @param factory the AttributeFactory to use
|
||||||
* @param systemDictionary a custom known token dictionary
|
* @param systemDictionary a custom known token dictionary
|
||||||
@ -230,6 +230,7 @@ public final class JapaneseTokenizer extends Tokenizer {
|
|||||||
* @param userDictionary Optional: if non-null, user dictionary.
|
* @param userDictionary Optional: if non-null, user dictionary.
|
||||||
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
|
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
|
||||||
* @param mode tokenization mode.
|
* @param mode tokenization mode.
|
||||||
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
public JapaneseTokenizer(AttributeFactory factory,
|
public JapaneseTokenizer(AttributeFactory factory,
|
||||||
TokenInfoDictionary systemDictionary,
|
TokenInfoDictionary systemDictionary,
|
||||||
|
@ -23,7 +23,17 @@ import java.nio.file.Paths;
|
|||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tool to build dictionaries.
|
* Tool to build dictionaries. Usage:
|
||||||
|
* <pre>
|
||||||
|
* java -cp [lucene classpath] org.apache.lucene.analysis.ja.util.DictionaryBuilder \
|
||||||
|
* ${inputDir} ${outputDir} ${encoding}
|
||||||
|
* </pre>
|
||||||
|
*
|
||||||
|
* <p> The input directory is expected to include unk.def, matrix.def, plus any number of .csv
|
||||||
|
* files, roughly following the conventions of IPADIC. JapaneseTokenizer uses dictionaries built
|
||||||
|
* with this tool. Note that the input files required by this build generally must be generated from
|
||||||
|
* a corpus of real text using tools that are not part of Lucene. </p>
|
||||||
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
public class DictionaryBuilder {
|
public class DictionaryBuilder {
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user