LUCENE-8971: Enable constructing JapaneseTokenizer with custom dictionary

This commit is contained in:
Michael Sokolov 2019-09-06 08:34:32 -04:00 committed by Michael Sokolov
parent c514b29b24
commit 770464ec20
6 changed files with 86 additions and 11 deletions

View File

@ -202,7 +202,7 @@ public final class JapaneseTokenizer extends Tokenizer {
}
/**
* Create a new JapaneseTokenizer.
* Create a new JapaneseTokenizer using the system and unknown dictionaries shipped with Lucene.
*
* @param factory the AttributeFactory to use
* @param userDictionary Optional: if non-null, user dictionary.
@ -211,13 +211,40 @@ public final class JapaneseTokenizer extends Tokenizer {
*/
public JapaneseTokenizer
(AttributeFactory factory, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
// Delegate to the full constructor, supplying the singleton system dictionary,
// unknown-word dictionary, and connection costs bundled with Lucene.
this(factory,
TokenInfoDictionary.getInstance(),
UnknownDictionary.getInstance(),
ConnectionCosts.getInstance(),
userDictionary, discardPunctuation, mode);
}
/**
* Create a new JapaneseTokenizer, supplying a custom system dictionary and unknown dictionary.
* <p>
* The supplied dictionaries replace the built-in ones shipped with Lucene.
*
* @param factory the AttributeFactory to use
* @param systemDictionary a custom known token dictionary
* @param unkDictionary a custom unknown token dictionary
* @param connectionCosts custom token transition costs
* @param userDictionary Optional: if non-null, user dictionary.
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
* @param mode tokenization mode.
*/
public JapaneseTokenizer(AttributeFactory factory,
TokenInfoDictionary systemDictionary,
UnknownDictionary unkDictionary,
ConnectionCosts connectionCosts,
UserDictionary userDictionary,
boolean discardPunctuation,
Mode mode) {
super(factory);
dictionary = TokenInfoDictionary.getInstance();
fst = dictionary.getFST();
unkDictionary = UnknownDictionary.getInstance();
characterDefinition = unkDictionary.getCharacterDefinition();
this.dictionary = systemDictionary;
this.fst = dictionary.getFST();
this.unkDictionary = unkDictionary;
this.characterDefinition = unkDictionary.getCharacterDefinition();
this.userDictionary = userDictionary;
costs = ConnectionCosts.getInstance();
this.costs = connectionCosts;
fstReader = fst.getBytesReader();
if (userDictionary != null) {
userFST = userDictionary.getFST();

View File

@ -176,6 +176,17 @@ public abstract class BinaryDictionary implements Dictionary {
}
}
/**
 * Opens an InputStream for the given resource path.
 *
 * @param scheme how to resolve the path: CLASSPATH loads via this class's
 *               class loader, FILE reads from the local filesystem
 * @param path the resource location to open
 * @return an open stream; the caller is responsible for closing it
 * @throws IOException if the resource cannot be opened
 */
public static final InputStream getResource(ResourceScheme scheme, String path) throws IOException {
  final InputStream stream;
  switch (scheme) {
    case CLASSPATH:
      stream = getClassResource(path);
      break;
    case FILE:
      stream = Files.newInputStream(Paths.get(path));
      break;
    default:
      // Defensive: only reachable if a new scheme is added without updating this switch.
      throw new IllegalStateException("unknown resource scheme " + scheme);
  }
  return stream;
}
// util, reused by ConnectionCosts and CharacterDefinition
public static final InputStream getClassResource(Class<?> clazz, String suffix) throws IOException {
final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix);
@ -185,7 +196,7 @@ public abstract class BinaryDictionary implements Dictionary {
return is;
}
private InputStream getClassResource(String path) throws IOException {
private static InputStream getClassResource(String path) throws IOException {
final InputStream is = BinaryDictionary.class.getClassLoader().getResourceAsStream(path);
if (is == null) {
throw new FileNotFoundException("Not in classpath: " + path);

View File

@ -37,12 +37,16 @@ public final class ConnectionCosts {
private final short[][] costs; // array is backward IDs first since get is called using the same backward ID consecutively. maybe doesn't matter.
private ConnectionCosts() throws IOException {
/**
* @param scheme - scheme for loading resources (FILE or CLASSPATH).
* @param path - where to load resources from, without the ".dat" suffix
*/
public ConnectionCosts(BinaryDictionary.ResourceScheme scheme, String path) throws IOException {
InputStream is = null;
short[][] costs = null;
boolean success = false;
try {
is = BinaryDictionary.getClassResource(getClass(), FILENAME_SUFFIX);
is = BinaryDictionary.getResource(scheme, path.replace('.', '/') + FILENAME_SUFFIX);
is = new BufferedInputStream(is);
final DataInput in = new InputStreamDataInput(is);
CodecUtil.checkHeader(in, HEADER, VERSION, VERSION);
@ -68,7 +72,11 @@ public final class ConnectionCosts {
this.costs = costs;
}
// Loads the default connection costs bundled on the classpath next to this class
// (the class name doubles as the resource path, minus the ".dat" suffix).
private ConnectionCosts() throws IOException {
this(BinaryDictionary.ResourceScheme.CLASSPATH, ConnectionCosts.class.getName());
}
/**
 * Returns the transition cost between two part-of-speech IDs.
 *
 * @param forwardId forward (right) connection ID
 * @param backwardId backward (left) connection ID
 * @return the connection cost
 */
public int get(int forwardId, int backwardId) {
  // The table is indexed backward-ID first (see the field comment on costs).
  final short[] row = costs[backwardId];
  return row[forwardId];
}

View File

@ -40,7 +40,7 @@ public final class TokenInfoDictionary extends BinaryDictionary {
* @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH scheme only, use
* this class's name as the path.
*/
TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
public TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
super(resourceScheme, resourcePath);
FST<Long> fst;
try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {

View File

@ -26,6 +26,15 @@ public final class UnknownDictionary extends BinaryDictionary {
private final CharacterDefinition characterDefinition = CharacterDefinition.getInstance();
/**
* @param scheme scheme for loading resources (FILE or CLASSPATH).
* @param path where to load resources from; a path, including the file base name without
* extension; this is used to match multiple files with the same base name.
*/
public UnknownDictionary(ResourceScheme scheme, String path) throws IOException {
// All resource resolution and loading is handled by BinaryDictionary.
super(scheme, path);
}
// Default constructor used by the singleton; presumably loads the bundled
// classpath resources via BinaryDictionary's no-arg constructor — TODO confirm.
private UnknownDictionary() throws IOException {
super();
}

View File

@ -33,7 +33,10 @@ import org.apache.lucene.analysis.MockGraphTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.ja.dict.BinaryDictionary.ResourceScheme;
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
import org.apache.lucene.analysis.ja.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.ja.dict.UnknownDictionary;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.apache.lucene.analysis.ja.tokenattributes.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@ -441,6 +444,23 @@ public class
);
}
// Make sure loading custom dictionaries from classpath works:
public void testCustomDictionary() throws Exception {
  // Point every dictionary at the same classpath location the defaults live in.
  final String dictPath = "org/apache/lucene/analysis/ja/dict/";
  TokenInfoDictionary systemDict =
      new TokenInfoDictionary(ResourceScheme.CLASSPATH, dictPath + "TokenInfoDictionary");
  UnknownDictionary unkDict =
      new UnknownDictionary(ResourceScheme.CLASSPATH, dictPath + "UnknownDictionary");
  ConnectionCosts connectionCosts =
      new ConnectionCosts(ResourceScheme.CLASSPATH, dictPath + "ConnectionCosts");
  Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(),
      systemDict, unkDict, connectionCosts,
      readDict(), true, Mode.SEARCH);
  try (Analyzer a = makeAnalyzer(tokenizer)) {
    assertTokenStreamContents(a.tokenStream("foo", "abcd"),
        new String[] { "a", "b", "cd" },
        new int[] { 0, 1, 2 },
        new int[] { 1, 2, 4 },
        4
    );
  }
}
// HMM: fails (segments as a/b/cd/efghij)... because the
// two paths have exactly equal paths (1 KNOWN + 1
// UNKNOWN) and we don't seem to favor longer KNOWN /