LUCENE-10400: revise binary dictionaries' constructor in nori (#693)

2025-02-10 20:15:18 +00:00 · 2022-02-20 16:16:56 +09:00 · 2022-02-20 16:16:56 +09:00 · 58fa95deea
commit 58fa95deea
parent f0d17e94d9
9 changed files with 292 additions and 123 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -77,7 +77,7 @@ API Changes
 * LUCENE-10368: IntTaxonomyFacets has been deprecated and is no longer a supported extension point
  for user-created faceting implementations. (Greg Miller)

-* LUCENE-10400: Add constructors that take external resource Paths to dictionary classes in Kuromoji:
+* LUCENE-10400: Add constructors that take external resource Paths to dictionary classes in Kuromoji and Nori:
  ConnectionCosts, TokenInfoDictionary, and UnknownDictionary. Old constructors that take resource scheme and
  resource path in those classes are deprecated; These are replaced with the new constructors and planned to be
  removed in a future release. (Tomoko Uchida, Uwe Schindler, Mike Sokolov)
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/BinaryDictionary.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/BinaryDictionary.java
@@ -18,25 +18,23 @@ package org.apache.lucene.analysis.ko.dict;

 import java.io.BufferedInputStream;
 import java.io.EOFException;
-import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.ByteBuffer;
 import java.nio.channels.Channels;
 import java.nio.channels.ReadableByteChannel;
-import java.nio.file.Files;
-import java.nio.file.Paths;
 import org.apache.lucene.analysis.ko.POS;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.InputStreamDataInput;
-import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.IOSupplier;
 import org.apache.lucene.util.IntsRef;

 /** Base class for a binary-encoded in-memory dictionary. */
 public abstract class BinaryDictionary implements Dictionary {

  /** Used to specify where (dictionary) resources get loaded from. */
+  @Deprecated(forRemoval = true, since = "9.1")
  public enum ResourceScheme {
    CLASSPATH,
    FILE
@@ -51,75 +49,36 @@ public abstract class BinaryDictionary implements Dictionary {
  public static final String POSDICT_HEADER = "ko_dict_pos";
  public static final int VERSION = 1;

-  private final ResourceScheme resourceScheme;
-  private final String resourcePath;
  private final ByteBuffer buffer;
  private final int[] targetMapOffsets, targetMap;
  private final POS.Tag[] posDict;

-  protected BinaryDictionary() throws IOException {
-    this(ResourceScheme.CLASSPATH, null);
-  }
-
-  /**
-   * @param resourceScheme - scheme for loading resources (FILE or CLASSPATH).
-   * @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH
-   *     scheme only, use this class's name as the path.
-   */
-  protected BinaryDictionary(ResourceScheme resourceScheme, String resourcePath)
+  protected BinaryDictionary(
+      IOSupplier<InputStream> targetMapResource,
+      IOSupplier<InputStream> posResource,
+      IOSupplier<InputStream> dictResource)
      throws IOException {
-    this.resourceScheme = resourceScheme;
-    if (resourcePath == null) {
-      if (resourceScheme != ResourceScheme.CLASSPATH) {
-        throw new IllegalArgumentException(
-            "resourcePath must be supplied with FILE resource scheme");
-      }
-      this.resourcePath = getClass().getSimpleName();
-    } else {
-      if (resourceScheme == ResourceScheme.CLASSPATH && !resourcePath.startsWith("/")) {
-        resourcePath = "/".concat(resourcePath);
-      }
-      this.resourcePath = resourcePath;
-    }
-    int[] targetMapOffsets, targetMap;
-    ByteBuffer buffer;
-    try (InputStream mapIS = new BufferedInputStream(getResource(TARGETMAP_FILENAME_SUFFIX));
-        InputStream posIS = new BufferedInputStream(getResource(POSDICT_FILENAME_SUFFIX));
-        // no buffering here, as we load in one large buffer
-        InputStream dictIS = getResource(DICT_FILENAME_SUFFIX)) {
+    try (InputStream mapIS = new BufferedInputStream(targetMapResource.get())) {
      DataInput in = new InputStreamDataInput(mapIS);
      CodecUtil.checkHeader(in, TARGETMAP_HEADER, VERSION, VERSION);
-      targetMap = new int[in.readVInt()];
-      targetMapOffsets = new int[in.readVInt()];
-      int accum = 0, sourceId = 0;
-      for (int ofs = 0; ofs < targetMap.length; ofs++) {
-        final int val = in.readVInt();
-        if ((val & 0x01) != 0) {
-          targetMapOffsets[sourceId] = ofs;
-          sourceId++;
-        }
-        accum += val >>> 1;
-        targetMap[ofs] = accum;
-      }
-      if (sourceId + 1 != targetMapOffsets.length)
-        throw new IOException(
-            "targetMap file format broken; targetMap.length="
-                + targetMap.length
-                + ", targetMapOffsets.length="
-                + targetMapOffsets.length
-                + ", sourceId="
-                + sourceId);
-      targetMapOffsets[sourceId] = targetMap.length;
+      this.targetMap = new int[in.readVInt()];
+      this.targetMapOffsets = new int[in.readVInt()];
+      populateTargetMap(in, this.targetMap, this.targetMapOffsets);
+    }

-      in = new InputStreamDataInput(posIS);
+    try (InputStream posIS = new BufferedInputStream(posResource.get())) {
+      DataInput in = new InputStreamDataInput(posIS);
      CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION);
      int posSize = in.readVInt();
-      posDict = new POS.Tag[posSize];
+      this.posDict = new POS.Tag[posSize];
      for (int j = 0; j < posSize; j++) {
        posDict[j] = POS.resolveTag(in.readByte());
      }
+    }

-      in = new InputStreamDataInput(dictIS);
+    // no buffering here, as we load in one large buffer
+    try (InputStream dictIS = dictResource.get()) {
+      DataInput in = new InputStreamDataInput(dictIS);
      CodecUtil.checkHeader(in, DICT_HEADER, VERSION, VERSION);
      final int size = in.readVInt();
      final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
@@ -128,48 +87,31 @@ public abstract class BinaryDictionary implements Dictionary {
      if (read != size) {
        throw new EOFException("Cannot read whole dictionary");
      }
-      buffer = tmpBuffer.asReadOnlyBuffer();
-    }
-
-    this.targetMap = targetMap;
-    this.targetMapOffsets = targetMapOffsets;
-    this.buffer = buffer;
-  }
-
-  protected final InputStream getResource(String suffix) throws IOException {
-    switch (resourceScheme) {
-      case CLASSPATH:
-        return getClassResource(resourcePath + suffix);
-      case FILE:
-        return Files.newInputStream(Paths.get(resourcePath + suffix));
-      default:
-        throw new IllegalStateException("unknown resource scheme " + resourceScheme);
+      this.buffer = tmpBuffer.asReadOnlyBuffer();
    }
  }

-  public static InputStream getResource(ResourceScheme scheme, String path) throws IOException {
-    switch (scheme) {
-      case CLASSPATH:
-        return getClassResource(path);
-      case FILE:
-        return Files.newInputStream(Paths.get(path));
-      default:
-        throw new IllegalStateException("unknown resource scheme " + scheme);
+  private static void populateTargetMap(DataInput in, int[] targetMap, int[] targetMapOffsets)
+      throws IOException {
+    int accum = 0, sourceId = 0;
+    for (int ofs = 0; ofs < targetMap.length; ofs++) {
+      final int val = in.readVInt();
+      if ((val & 0x01) != 0) {
+        targetMapOffsets[sourceId] = ofs;
+        sourceId++;
+      }
+      accum += val >>> 1;
+      targetMap[ofs] = accum;
    }
-  }
-
-  // util, reused by ConnectionCosts and CharacterDefinition
-  public static InputStream getClassResource(Class<?> clazz, String suffix) throws IOException {
-    final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix);
-    if (is == null) {
-      throw new FileNotFoundException(
-          "Not in classpath: " + clazz.getName().replace('.', '/') + suffix);
-    }
-    return is;
-  }
-
-  private static InputStream getClassResource(String path) throws IOException {
-    return IOUtils.requireResourceNonNull(BinaryDictionary.class.getResourceAsStream(path), path);
+    if (sourceId + 1 != targetMapOffsets.length)
+      throw new IOException(
+          "targetMap file format broken; targetMap.length="
+              + targetMap.length
+              + ", targetMapOffsets.length="
+              + targetMapOffsets.length
+              + ", sourceId="
+              + sourceId);
+    targetMapOffsets[sourceId] = targetMap.length;
  }

  public void lookupWordIds(int sourceId, IntsRef ref) {
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/CharacterDefinition.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/CharacterDefinition.java
@@ -73,11 +73,7 @@ public final class CharacterDefinition {
  public static final byte HANJANUMERIC = (byte) CharacterClass.HANJANUMERIC.ordinal();

  private CharacterDefinition() throws IOException {
-    InputStream is = null;
-    boolean success = false;
-    try {
-      is = BinaryDictionary.getClassResource(getClass(), FILENAME_SUFFIX);
-      is = new BufferedInputStream(is);
+    try (InputStream is = new BufferedInputStream(getClassResource())) {
      final DataInput in = new InputStreamDataInput(is);
      CodecUtil.checkHeader(in, HEADER, VERSION, VERSION);
      in.readBytes(characterCategoryMap, 0, characterCategoryMap.length);
@@ -86,16 +82,15 @@ public final class CharacterDefinition {
        invokeMap[i] = (b & 0x01) != 0;
        groupMap[i] = (b & 0x02) != 0;
      }
-      success = true;
-    } finally {
-      if (success) {
-        IOUtils.close(is);
-      } else {
-        IOUtils.closeWhileHandlingException(is);
-      }
    }
  }

+  private static InputStream getClassResource() throws IOException {
+    final String resourcePath = CharacterDefinition.class.getSimpleName() + FILENAME_SUFFIX;
+    return IOUtils.requireResourceNonNull(
+        CharacterDefinition.class.getResourceAsStream(resourcePath), resourcePath);
+  }
+
  public byte getCharacterClass(char c) {
    return characterCategoryMap[c];
  }
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/ConnectionCosts.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/ConnectionCosts.java
@@ -20,9 +20,14 @@ import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.InputStreamDataInput;
+import org.apache.lucene.util.IOSupplier;
+import org.apache.lucene.util.IOUtils;

 /** n-gram connection cost data */
 public final class ConnectionCosts {
@@ -38,12 +43,32 @@ public final class ConnectionCosts {
   * @param scheme - scheme for loading resources (FILE or CLASSPATH).
   * @param resourcePath - where to load resources from, without the ".dat" suffix
   */
+  @Deprecated(forRemoval = true, since = "9.1")
+  @SuppressWarnings("removal")
  public ConnectionCosts(BinaryDictionary.ResourceScheme scheme, String resourcePath)
      throws IOException {
-    try (InputStream is =
-        new BufferedInputStream(
-            BinaryDictionary.getResource(
-                scheme, "/" + resourcePath.replace('.', '/') + FILENAME_SUFFIX))) {
+    this(
+        scheme == BinaryDictionary.ResourceScheme.FILE
+            ? () -> Files.newInputStream(Paths.get(resourcePath + FILENAME_SUFFIX))
+            : ConnectionCosts::getClassResource);
+  }
+
+  /**
+   * Create a {@link ConnectionCosts} from an external resource path.
+   *
+   * @param connectionCostsFile where to load connection costs resource
+   * @throws IOException if resource was not found or broken
+   */
+  public ConnectionCosts(Path connectionCostsFile) throws IOException {
+    this(() -> Files.newInputStream(connectionCostsFile));
+  }
+
+  private ConnectionCosts() throws IOException {
+    this(ConnectionCosts::getClassResource);
+  }
+
+  private ConnectionCosts(IOSupplier<InputStream> connectionCostResource) throws IOException {
+    try (InputStream is = new BufferedInputStream(connectionCostResource.get())) {
      final DataInput in = new InputStreamDataInput(is);
      CodecUtil.checkHeader(in, HEADER, VERSION, VERSION);
      this.forwardSize = in.readVInt();
@@ -63,8 +88,10 @@ public final class ConnectionCosts {
    }
  }

-  private ConnectionCosts() throws IOException {
-    this(BinaryDictionary.ResourceScheme.CLASSPATH, ConnectionCosts.class.getName());
+  private static InputStream getClassResource() throws IOException {
+    final String resourcePath = ConnectionCosts.class.getSimpleName() + FILENAME_SUFFIX;
+    return IOUtils.requireResourceNonNull(
+        ConnectionCosts.class.getResourceAsStream(resourcePath), resourcePath);
  }

  public int get(int forwardId, int backwardId) {
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java
@@ -19,8 +19,13 @@ package org.apache.lucene.analysis.ko.dict;
 import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.InputStreamDataInput;
+import org.apache.lucene.util.IOSupplier;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PositiveIntOutputs;

@@ -35,7 +40,11 @@ public final class TokenInfoDictionary extends BinaryDictionary {
  private final TokenInfoFST fst;

  private TokenInfoDictionary() throws IOException {
-    this(ResourceScheme.CLASSPATH, null);
+    this(
+        () -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
+        () -> getClassResource(POSDICT_FILENAME_SUFFIX),
+        () -> getClassResource(DICT_FILENAME_SUFFIX),
+        () -> getClassResource(FST_FILENAME_SUFFIX));
  }

  /**
@@ -43,17 +52,64 @@ public final class TokenInfoDictionary extends BinaryDictionary {
   * @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH
   *     scheme only, use this class's name as the path.
   */
+  @Deprecated(forRemoval = true, since = "9.1")
+  @SuppressWarnings("removal")
  public TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath)
      throws IOException {
-    super(resourceScheme, resourcePath);
+    this(
+        resourceScheme == ResourceScheme.FILE
+            ? () -> Files.newInputStream(Paths.get(resourcePath + TARGETMAP_FILENAME_SUFFIX))
+            : () -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
+        resourceScheme == ResourceScheme.FILE
+            ? () -> Files.newInputStream(Paths.get(resourcePath + POSDICT_FILENAME_SUFFIX))
+            : () -> getClassResource(POSDICT_FILENAME_SUFFIX),
+        resourceScheme == ResourceScheme.FILE
+            ? () -> Files.newInputStream(Paths.get(resourcePath + DICT_FILENAME_SUFFIX))
+            : () -> getClassResource(DICT_FILENAME_SUFFIX),
+        resourceScheme == ResourceScheme.FILE
+            ? () -> Files.newInputStream(Paths.get(resourcePath + FST_FILENAME_SUFFIX))
+            : () -> getClassResource(FST_FILENAME_SUFFIX));
+  }
+
+  /**
+   * Create a {@link TokenInfoDictionary} from an external resource path.
+   *
+   * @param targetMapFile where to load target map resource
+   * @param posDictFile where to load POS dictionary resource
+   * @param dictFile where to load dictionary entries resource
+   * @param fstFile where to load encoded FST data resource
+   * @throws IOException if resource was not found or broken
+   */
+  public TokenInfoDictionary(Path targetMapFile, Path posDictFile, Path dictFile, Path fstFile)
+      throws IOException {
+    this(
+        () -> Files.newInputStream(targetMapFile),
+        () -> Files.newInputStream(posDictFile),
+        () -> Files.newInputStream(dictFile),
+        () -> Files.newInputStream(fstFile));
+  }
+
+  private TokenInfoDictionary(
+      IOSupplier<InputStream> targetMapResource,
+      IOSupplier<InputStream> posResource,
+      IOSupplier<InputStream> dictResource,
+      IOSupplier<InputStream> fstResource)
+      throws IOException {
+    super(targetMapResource, posResource, dictResource);
    FST<Long> fst;
-    try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {
+    try (InputStream is = new BufferedInputStream(fstResource.get())) {
      DataInput in = new InputStreamDataInput(is);
      fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
    }
    this.fst = new TokenInfoFST(fst);
  }

+  private static InputStream getClassResource(String suffix) throws IOException {
+    final String resourcePath = TokenInfoDictionary.class.getSimpleName() + suffix;
+    return IOUtils.requireResourceNonNull(
+        TokenInfoDictionary.class.getResourceAsStream(resourcePath), resourcePath);
+  }
+
  public TokenInfoFST getFST() {
    return fst;
  }
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownDictionary.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownDictionary.java
@@ -17,6 +17,11 @@
 package org.apache.lucene.analysis.ko.dict;

 import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import org.apache.lucene.util.IOUtils;

 /** Dictionary for unknown-word handling. */
 public final class UnknownDictionary extends BinaryDictionary {
@@ -27,12 +32,47 @@ public final class UnknownDictionary extends BinaryDictionary {
   * @param resourcePath where to load resources from; a path, including the file base name without
   *     extension; this is used to match multiple files with the same base name.
   */
+  @Deprecated(forRemoval = true, since = "9.1")
+  @SuppressWarnings("removal")
  public UnknownDictionary(ResourceScheme scheme, String resourcePath) throws IOException {
-    super(scheme, resourcePath);
+    super(
+        scheme == ResourceScheme.FILE
+            ? () -> Files.newInputStream(Paths.get(resourcePath + TARGETMAP_FILENAME_SUFFIX))
+            : () -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
+        scheme == ResourceScheme.FILE
+            ? () -> Files.newInputStream(Paths.get(resourcePath + POSDICT_FILENAME_SUFFIX))
+            : () -> getClassResource(POSDICT_FILENAME_SUFFIX),
+        scheme == ResourceScheme.FILE
+            ? () -> Files.newInputStream(Paths.get(resourcePath + DICT_FILENAME_SUFFIX))
+            : () -> getClassResource(DICT_FILENAME_SUFFIX));
+  }
+
+  /**
+   * Create a {@link UnknownDictionary} from an external resource path.
+   *
+   * @param targetMapFile where to load target map resource
+   * @param posDictFile where to load POS dictionary resource
+   * @param dictFile where to load dictionary entries resource
+   * @throws IOException if resource was not found or broken
+   */
+  public UnknownDictionary(Path targetMapFile, Path posDictFile, Path dictFile) throws IOException {
+    super(
+        () -> Files.newInputStream(targetMapFile),
+        () -> Files.newInputStream(posDictFile),
+        () -> Files.newInputStream(dictFile));
  }

  private UnknownDictionary() throws IOException {
-    super();
+    super(
+        () -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
+        () -> getClassResource(POSDICT_FILENAME_SUFFIX),
+        () -> getClassResource(DICT_FILENAME_SUFFIX));
+  }
+
+  private static InputStream getClassResource(String suffix) throws IOException {
+    final String resourcePath = UnknownDictionary.class.getSimpleName() + suffix;
+    return IOUtils.requireResourceNonNull(
+        UnknownDictionary.class.getResourceAsStream(resourcePath), resourcePath);
  }

  public CharacterDefinition getCharacterDefinition() {
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
@@ -473,6 +473,7 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
  }

  // Make sure loading custom dictionaries from classpath works:
+  @SuppressWarnings("removal")
  public void testCustomDictionary() throws Exception {
    Tokenizer tokenizer =
        new KoreanTokenizer(
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestExternalDictionary.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestExternalDictionary.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ko.dict;
+
+import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.DICT_FILENAME_SUFFIX;
+import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.POSDICT_FILENAME_SUFFIX;
+import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.TARGETMAP_FILENAME_SUFFIX;
+import static org.apache.lucene.analysis.ko.dict.TokenInfoDictionary.FST_FILENAME_SUFFIX;
+
+import java.io.BufferedWriter;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import org.apache.lucene.analysis.ko.util.DictionaryBuilder;
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.junit.Before;
+
+public class TestExternalDictionary extends LuceneTestCase {
+
+  private Path dir;
+
+  @Override
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+    dir = createTempDir("systemDict");
+    try (BufferedWriter writer =
+        Files.newBufferedWriter(dir.resolve("unk.def"), StandardCharsets.UTF_8)) {
+      writer.write("DEFAULT,1798,3559,3677,SY,*,*,*,*,*,*,*");
+      writer.newLine();
+      writer.write("SPACE,1795,3556,1065,SP,*,*,*,*,*,*,*");
+      writer.newLine();
+    }
+    try (BufferedWriter writer =
+        Files.newBufferedWriter(dir.resolve("char.def"), StandardCharsets.UTF_8)) {
+      writer.write("0x0021..0x002F SYMBOL");
+      writer.newLine();
+      writer.write("0x0030..0x0039 NUMERIC");
+      writer.newLine();
+    }
+    try (BufferedWriter writer =
+        Files.newBufferedWriter(dir.resolve("matrix.def"), StandardCharsets.UTF_8)) {
+      writer.write("3 3");
+      writer.newLine();
+      writer.write("1 1 0");
+      writer.newLine();
+      writer.write("1 2 0");
+      writer.newLine();
+    }
+    try (BufferedWriter writer =
+        Files.newBufferedWriter(dir.resolve("noun.csv"), StandardCharsets.UTF_8)) {
+      writer.write("명사,1,1,2,NNG,*,*,*,*,*,*,*");
+      writer.newLine();
+      writer.write("일반,5000,5000,3,NNG,*,*,*,*,*,*,*");
+      writer.newLine();
+    }
+    DictionaryBuilder.build(dir, dir, "utf-8", true);
+  }
+
+  public void testLoadExternalTokenInfoDictionary() throws Exception {
+    String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', '/');
+    TokenInfoDictionary dict =
+        new TokenInfoDictionary(
+            dir.resolve(dictionaryPath + TARGETMAP_FILENAME_SUFFIX),
+            dir.resolve(dictionaryPath + POSDICT_FILENAME_SUFFIX),
+            dir.resolve(dictionaryPath + DICT_FILENAME_SUFFIX),
+            dir.resolve(dictionaryPath + FST_FILENAME_SUFFIX));
+    assertNotNull(dict.getFST());
+  }
+
+  public void testLoadExternalUnknownDictionary() throws Exception {
+    String dictionaryPath = UnknownDictionary.class.getName().replace('.', '/');
+    UnknownDictionary dict =
+        new UnknownDictionary(
+            dir.resolve(dictionaryPath + TARGETMAP_FILENAME_SUFFIX),
+            dir.resolve(dictionaryPath + POSDICT_FILENAME_SUFFIX),
+            dir.resolve(dictionaryPath + DICT_FILENAME_SUFFIX));
+    assertNotNull(dict.getCharacterDefinition());
+  }
+
+  public void testLoadExternalConnectionCosts() throws Exception {
+    String dictionaryPath = ConnectionCosts.class.getName().replace('.', '/');
+    ConnectionCosts cc =
+        new ConnectionCosts(dir.resolve(dictionaryPath + ConnectionCosts.FILENAME_SUFFIX));
+    assertEquals(0, cc.get(1, 1));
+  }
+}
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java
@@ -16,7 +16,10 @@
 */
 package org.apache.lucene.analysis.ko.dict;

-import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.ResourceScheme;
+import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.DICT_FILENAME_SUFFIX;
+import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.POSDICT_FILENAME_SUFFIX;
+import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.TARGETMAP_FILENAME_SUFFIX;
+import static org.apache.lucene.analysis.ko.dict.TokenInfoDictionary.FST_FILENAME_SUFFIX;

 import java.io.OutputStream;
 import java.io.OutputStreamWriter;
@@ -76,7 +79,11 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
    DictionaryBuilder.build(dir, dir, "utf-8", true);
    String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', '/');
    // We must also load the other files (in BinaryDictionary) from the correct path
-    return new TokenInfoDictionary(ResourceScheme.FILE, dir.resolve(dictionaryPath).toString());
+    return new TokenInfoDictionary(
+        dir.resolve(dictionaryPath + TARGETMAP_FILENAME_SUFFIX),
+        dir.resolve(dictionaryPath + POSDICT_FILENAME_SUFFIX),
+        dir.resolve(dictionaryPath + DICT_FILENAME_SUFFIX),
+        dir.resolve(dictionaryPath + FST_FILENAME_SUFFIX));
  }

  public void testPutException() {