LUCENE-8981: update Kuromoji javadocs, adding experimental tags to DictionaryBuilder and JapaneseTokenizer ctor

Michael Sokolov 2019-09-16 13:27:37 -04:00
parent fce0a5d45b
commit 48307b5e82
2 changed files with 15 additions and 4 deletions

org/apache/lucene/analysis/ja/JapaneseTokenizer.java

@@ -219,9 +219,9 @@ public final class JapaneseTokenizer extends Tokenizer {
   }
 
   /**
-   * Create a new JapaneseTokenizer, supplying a custom system dictionary and unknown dictionary.
-   * <p>
-   * Uses the default AttributeFactory.
+   * <p>Create a new JapaneseTokenizer, supplying a custom system dictionary and unknown dictionary.
+   * This constructor provides an entry point for users that want to construct custom language models
+   * that can be used as input to {@link org.apache.lucene.analysis.ja.util.DictionaryBuilder}.</p>
    *
    * @param factory the AttributeFactory to use
    * @param systemDictionary a custom known token dictionary
@@ -230,6 +230,7 @@ public final class JapaneseTokenizer extends Tokenizer {
    * @param userDictionary Optional: if non-null, user dictionary.
    * @param discardPunctuation true if punctuation tokens should be dropped from the output.
    * @param mode tokenization mode.
+   * @lucene.experimental
    */
   public JapaneseTokenizer(AttributeFactory factory,
                            TokenInfoDictionary systemDictionary,

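As an illustration of the entry point described in the new javadoc, here is a minimal sketch of constructing a tokenizer over a custom language model. The helper class and method are hypothetical, the unkDictionary and connectionCosts parameters are assumed from the full constructor signature (only part of it is visible in the truncated hunk above), and loading the dictionary instances from a DictionaryBuilder output is left out.

import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
import org.apache.lucene.analysis.ja.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.ja.dict.UnknownDictionary;
import org.apache.lucene.util.AttributeFactory;

// Hypothetical helper, not part of this patch.
public class CustomDictionaryExample {

  // Builds a tokenizer from dictionaries produced by a custom DictionaryBuilder run.
  // The three dictionary objects are assumed to be loaded elsewhere.
  static JapaneseTokenizer newCustomTokenizer(TokenInfoDictionary systemDictionary,
                                              UnknownDictionary unkDictionary,
                                              ConnectionCosts connectionCosts) {
    return new JapaneseTokenizer(
        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, // default AttributeFactory
        systemDictionary,                           // custom known-token dictionary
        unkDictionary,                              // custom unknown-token dictionary
        connectionCosts,                            // custom connection costs
        null,                                       // no user dictionary
        true,                                       // discard punctuation tokens
        JapaneseTokenizer.Mode.SEARCH);             // tokenization mode
  }
}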
org/apache/lucene/analysis/ja/util/DictionaryBuilder.java

@@ -23,7 +23,17 @@ import java.nio.file.Paths;
 import java.util.Locale;
 
 /**
- * Tool to build dictionaries.
+ * Tool to build dictionaries. Usage:
+ * <pre>
+ *  java -cp [lucene classpath] org.apache.lucene.analysis.ja.util.DictionaryBuilder \
+ *    ${inputDir} ${outputDir} ${encoding}
+ * </pre>
+ *
+ * <p> The input directory is expected to include unk.def, matrix.def, plus any number of .csv
+ * files, roughly following the conventions of IPADIC. JapaneseTokenizer uses dictionaries built
+ * with this tool. Note that the input files required by this build generally must be generated from
+ * a corpus of real text using tools that are not part of Lucene. </p>
+ * @lucene.experimental
  */
 public class DictionaryBuilder {
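To make the usage text above concrete, a hypothetical driver that runs the builder from Java rather than from the command line. The input directory, output directory, and encoding are placeholder values, and invoking the class through a main(String[]) entry point with exactly these three arguments is an assumption taken from the java -cp usage shown in the new javadoc.

// Hypothetical driver, not part of this patch: runs DictionaryBuilder programmatically
// with the same three arguments as the command line shown in the javadoc above.
public class BuildKuromojiDictionary {
  public static void main(String[] args) throws Exception {
    org.apache.lucene.analysis.ja.util.DictionaryBuilder.main(new String[] {
        "/tmp/custom-ipadic",   // input dir containing unk.def, matrix.def and *.csv files
        "/tmp/kuromoji-out",    // output dir for the compiled binary dictionary files
        "euc-jp"                // character encoding of the input files
    });
  }
}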