LUCENE-8934: promote nori tools to main jar

2019-07-31 01:14:04 +09:00 · 2019-07-31 01:14:04 +09:00 · 2c0d8996cf
parent 254a17b3b0
commit 2c0d8996cf
18 changed files with 409 additions and 558 deletions
--- a/lucene/analysis/nori/build.xml
+++ b/lucene/analysis/nori/build.xml
@ -26,7 +26,6 @@
  <!-- currently whether rat detects this as binary or not
       is platform dependent?! -->
  <property name="rat.excludes" value="**/*.txt,**/bocchan.utf-8"/>
  <property name="rat.additional-includes" value="src/tools/**"/>
  <!-- we don't want to pull in ipadic/naist etc -->
  <property name="ivy.default.configuration" value="default"/>
@ -45,6 +44,9 @@
  <available type="dir" file="${build.dir}/${dict.version}" property="mecab-ko.dict.available"/>
  <path id="classpath">
    <dirset dir="${build.dir}">
      <include name="classes/java"/>
    </dirset>
    <pathelement path="${analyzers-common.jar}"/>
    <path refid="base.classpath"/>
  </path>
@ -57,28 +59,14 @@
    <untar src="${build.dir}/${dict.version}.tar" dest="${build.dir}"/>
  </target>
-  <path id="tools.classpath">
+  <target name="build-dict" depends="compile, download-dict">
    <path refid="classpath"/>
    <pathelement location="${build.dir}/classes/java"/>
    <pathelement location="${build.dir}/classes/tools"/>
  </path>
  <path id="tools.test.classpath">
    <path refid="tools.classpath"/>
    <path refid="test.base.classpath"/>
    <pathelement location="${build.dir}/classes/tools-test"/>
  </path>
  <target name="build-dict" depends="compile-tools, download-dict">
    <sequential>
      <delete verbose="true">
        <fileset dir="${resources.dir}/org/apache/lucene/analysis/ko/dict" includes="**/*"/>
      </delete>
      <!-- TODO: optimize the dictionary construction a bit so that you don't need 1G -->
      <java fork="true" failonerror="true" maxmemory="1g" classname="org.apache.lucene.analysis.ko.util.DictionaryBuilder">
-        <classpath>
+        <classpath refid="classpath"/>
          <path refid="tools.classpath"/>
        </classpath>
        <assertions>
          <enable package="org.apache.lucene"/>
        </assertions>
@ -90,34 +78,7 @@
    </sequential>
  </target>
-  <target name="compile-tools" depends="compile-core, common.compile-tools">
+  <target name="compile-test" depends="module-build.compile-test"/>
    <compile
        srcdir="src/tools/java"
        destdir="${build.dir}/classes/tools">
      <classpath>
        <path refid="tools.classpath"/>
      </classpath>
    </compile>
  </target>
  <target name="compile-tools-tests" depends="compile-tools">
    <compile
        srcdir="src/tools/test"
        destdir="${build.dir}/classes/tools-test">
      <classpath>
        <path refid="tools.test.classpath"/>
        <pathelement path="src/tools/test"/>
      </classpath>
    </compile>
  </target>
  <target name="test-tools" depends="install-junit4-taskdef, compile-tools-tests">
    <test-macro testsDir="${build.dir}/classes/tools-test" workDir="src/tools/test" junit.classpath="tools.test.classpath"/>
  </target>
  <target name="compile-test" depends="module-build.compile-test, compile-tools-tests"/>
  <!-- TODO: not until we properly make 'test-tools' work with clover etc
  <target name="test" depends="module-build.test, test-tools"/> -->
  <target name="regenerate" depends="build-dict"/>
 </project>
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java
@ -21,7 +21,6 @@ import java.io.InputStream;
 import java.io.IOException;
 import org.apache.lucene.store.InputStreamDataInput;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
@ -46,20 +45,9 @@ public final class TokenInfoDictionary extends BinaryDictionary {
   */
  TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
    super(resourceScheme, resourcePath);
    InputStream is = null;
    FST<Long> fst;
-    boolean success = false;
+    try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {
    try {
      is = getResource(FST_FILENAME_SUFFIX);
      is = new BufferedInputStream(is);
      fst = new FST<>(new InputStreamDataInput(is), PositiveIntOutputs.getSingleton());
      success = true;
    } finally {
      if (success) {
        IOUtils.close(is);
      } else {
        IOUtils.closeWhileHandlingException(is);
      }
    }
    this.fst = new TokenInfoFST(fst);
  }
--- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java
+++ b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java
@ -17,13 +17,13 @@
 package org.apache.lucene.analysis.ko.util;
 import java.io.BufferedOutputStream;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.OutputStream;
 import java.nio.ByteBuffer;
 import java.nio.channels.Channels;
 import java.nio.channels.WritableByteChannel;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
@ -37,17 +37,17 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.analysis.ko.dict.BinaryDictionary;
-public abstract class BinaryDictionaryWriter {
+abstract class BinaryDictionaryWriter {
  private final static int ID_LIMIT = 8192;
-  protected final Class<? extends BinaryDictionary> implClazz;
+  private final Class<? extends BinaryDictionary> implClazz;
  protected ByteBuffer buffer;
  private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1;
  private int[] targetMap = new int[8192];
  private int[] targetMapOffsets = new int[8192];
  private final ArrayList<String> posDict = new ArrayList<>();
-  public BinaryDictionaryWriter(Class<? extends BinaryDictionary> implClazz, int size) {
+  BinaryDictionaryWriter(Class<? extends BinaryDictionary> implClazz, int size) {
    this.implClazz = implClazz;
    buffer = ByteBuffer.allocate(size);
  }
@ -183,7 +183,7 @@ public abstract class BinaryDictionaryWriter {
    }
  }
-  public void addMapping(int sourceId, int wordId) {
+  void addMapping(int sourceId, int wordId) {
    if (wordId <= lastWordId) {
      throw new IllegalStateException("words out of order: " + wordId + " vs lastID: " + lastWordId);
    }
@ -205,27 +205,26 @@ public abstract class BinaryDictionaryWriter {
    lastWordId = wordId;
  }
-  protected final String getBaseFileName(String baseDir) {
+  final String getBaseFileName() {
-    return baseDir + File.separator + implClazz.getName().replace('.', File.separatorChar);
+    return implClazz.getName().replace('.', '/');
  }
  /**
   * Write dictionary in file
   * @throws IOException if an I/O error occurs writing the dictionary files
   */
-  public void write(String baseDir) throws IOException {
+  public void write(Path baseDir) throws IOException {
-    final String baseName = getBaseFileName(baseDir);
+    final String baseName = getBaseFileName();
-    writeDictionary(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX);
+    writeDictionary(baseDir.resolve(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX));
-    writeTargetMap(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX);
+    writeTargetMap(baseDir.resolve(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX));
-    writePosDict(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX);
+    writePosDict(baseDir.resolve(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX));
  }
-  protected void writeTargetMap(String filename) throws IOException {
+  private void writeTargetMap(Path path) throws IOException {
-    new File(filename).getParentFile().mkdirs();
+    Files.createDirectories(path.getParent());
-    OutputStream os = new FileOutputStream(filename);
+    try (OutputStream os = Files.newOutputStream(path);
-    try {
+         OutputStream bos = new BufferedOutputStream(os)) {
-      os = new BufferedOutputStream(os);
+      final DataOutput out = new OutputStreamDataOutput(bos);
      final DataOutput out = new OutputStreamDataOutput(os);
      CodecUtil.writeHeader(out, BinaryDictionary.TARGETMAP_HEADER, BinaryDictionary.VERSION);
      final int numSourceIds = lastSourceId + 1;
@ -246,17 +245,14 @@ public abstract class BinaryDictionaryWriter {
      if (sourceId != numSourceIds) {
        throw new IllegalStateException("sourceId:" + sourceId + " != numSourceIds:" + numSourceIds);
      }
    } finally {
      os.close();
    }
  }
-  protected void writePosDict(String filename) throws IOException {
+  private void writePosDict(Path path) throws IOException {
-    new File(filename).getParentFile().mkdirs();
+    Files.createDirectories(path.getParent());
-    OutputStream os = new FileOutputStream(filename);
+    try (OutputStream os = Files.newOutputStream(path);
-    try {
+         OutputStream bos = new BufferedOutputStream(os)) {
-      os = new BufferedOutputStream(os);
+      final DataOutput out = new OutputStreamDataOutput(bos);
      final DataOutput out = new OutputStreamDataOutput(os);
      CodecUtil.writeHeader(out, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION);
      out.writeVInt(posDict.size());
      for (String s : posDict) {
@ -270,25 +266,21 @@ public abstract class BinaryDictionaryWriter {
          out.writeByte((byte) POS.Tag.valueOf(data[0]).ordinal());
        }
      }
    } finally {
      os.close();
    }
  }
-  protected void writeDictionary(String filename) throws IOException {
+  private void writeDictionary(Path path) throws IOException {
-    new File(filename).getParentFile().mkdirs();
+    Files.createDirectories(path.getParent());
-    final FileOutputStream os = new FileOutputStream(filename);
+    try (OutputStream os = Files.newOutputStream(path);
-    try {
+         OutputStream bos = new BufferedOutputStream(os)) {
-      final DataOutput out = new OutputStreamDataOutput(os);
+      final DataOutput out = new OutputStreamDataOutput(bos);
      CodecUtil.writeHeader(out, BinaryDictionary.DICT_HEADER, BinaryDictionary.VERSION);
      out.writeVInt(buffer.position());
-      final WritableByteChannel channel = Channels.newChannel(os);
+      final WritableByteChannel channel = Channels.newChannel(bos);
      // Write Buffer
      buffer.flip();  // set position to 0, set limit to current position
      channel.write(buffer);
      assert buffer.remaining() == 0L;
    } finally {
      os.close();
    }
  }
 }
--- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/CharacterDefinitionWriter.java
+++ b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/CharacterDefinitionWriter.java
@ -17,10 +17,10 @@
 package org.apache.lucene.analysis.ko.util;
 import java.io.BufferedOutputStream;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.OutputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.Arrays;
 import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
@ -29,7 +29,7 @@ import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.OutputStreamDataOutput;
-public final class CharacterDefinitionWriter {
+final class CharacterDefinitionWriter {
  private final byte[] characterCategoryMap = new byte[0x10000];
@ -39,7 +39,7 @@ public final class CharacterDefinitionWriter {
  /**
   * Constructor for building. TODO: remove write access
   */
-  public CharacterDefinitionWriter() {
+  CharacterDefinitionWriter() {
    Arrays.fill(characterCategoryMap, CharacterDefinition.DEFAULT);
  }
@ -50,7 +50,7 @@ public final class CharacterDefinitionWriter {
   *            code point
   * @param characterClassName character class name
   */
-  public void putCharacterCategory(int codePoint, String characterClassName) {
+  void putCharacterCategory(int codePoint, String characterClassName) {
    characterClassName = characterClassName.split(" ")[0]; // use first
    // category
    // class
@ -62,20 +62,17 @@ public final class CharacterDefinitionWriter {
    characterCategoryMap[codePoint] = CharacterDefinition.lookupCharacterClass(characterClassName);
  }
-  public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
+  void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
    final byte characterClass = CharacterDefinition.lookupCharacterClass(characterClassName);
    invokeMap[characterClass] = invoke == 1;
    groupMap[characterClass] = group == 1;
    // TODO: length def ignored
  }
-  public void write(String baseDir) throws IOException {
+  public void write(Path baseDir) throws IOException {
-    String filename = baseDir + File.separator +
+    Path path = baseDir.resolve(CharacterDefinition.class.getName().replace('.', '/') + CharacterDefinition.FILENAME_SUFFIX);
-      CharacterDefinition.class.getName().replace('.', File.separatorChar) + CharacterDefinition.FILENAME_SUFFIX;
+    Files.createDirectories(path.getParent());
-    new File(filename).getParentFile().mkdirs();
+    try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(path))){
    OutputStream os = new FileOutputStream(filename);
    try {
      os = new BufferedOutputStream(os);
      final DataOutput out = new OutputStreamDataOutput(os);
      CodecUtil.writeHeader(out, CharacterDefinition.HEADER, CharacterDefinition.VERSION);
      out.writeBytes(characterCategoryMap, 0, characterCategoryMap.length);
@ -86,8 +83,6 @@ public final class CharacterDefinitionWriter {
        );
        out.writeByte(b);
      }
    } finally {
      os.close();
    }
  }
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/ConnectionCostsBuilder.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/ConnectionCostsBuilder.java
@ -0,0 +1,61 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.ko.util;
 import java.io.IOException;
 import java.io.LineNumberReader;
 import java.io.Reader;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 /**
 * Reads a connection cost matrix definition file and loads it into a
 * {@link ConnectionCostsWriter}.
 *
 * <p>The file is read as US-ASCII. Its first line holds two whitespace-separated integers
 * (forward size, then backward size); every following line holds three whitespace-separated
 * integers (forward id, backward id, cost).
 */
 class ConnectionCostsBuilder {
  private ConnectionCostsBuilder() {
  }

  /**
   * Parses the cost matrix stored at {@code path}.
   *
   * @param path location of the matrix definition file
   * @return a writer populated with every cost entry found in the file
   * @throws IOException if the file cannot be read
   * @throws IllegalArgumentException if the file is empty and therefore has no size header
   */
  public static ConnectionCostsWriter build(Path path) throws IOException {
    try (Reader reader = Files.newBufferedReader(path, StandardCharsets.US_ASCII);
         LineNumberReader lineReader = new LineNumberReader(reader)) {
      String line = lineReader.readLine();
      if (line == null) {
        // readLine() returns null at end of stream: report the empty file instead of
        // failing with a NullPointerException on the split below.
        throw new IllegalArgumentException("Connection costs file is empty (size header expected): " + path);
      }
      String[] dimensions = line.split("\\s+");
      assert dimensions.length == 2 : "malformed size header in " + path + ": " + line;
      int forwardSize = Integer.parseInt(dimensions[0]);
      int backwardSize = Integer.parseInt(dimensions[1]);
      assert forwardSize > 0 && backwardSize > 0 : "non-positive matrix size in " + path + ": " + line;
      ConnectionCostsWriter costs = new ConnectionCostsWriter(forwardSize, backwardSize);
      while ((line = lineReader.readLine()) != null) {
        String[] fields = line.split("\\s+");
        assert fields.length == 3 : "malformed cost entry at " + path + ":" + lineReader.getLineNumber() + ": " + line;
        int forwardId = Integer.parseInt(fields[0]);
        int backwardId = Integer.parseInt(fields[1]);
        int cost = Integer.parseInt(fields[2]);
        costs.add(forwardId, backwardId, cost);
      }
      return costs;
    }
  }
 }
--- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/ConnectionCostsWriter.java
+++ b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/ConnectionCostsWriter.java
@ -17,10 +17,10 @@
 package org.apache.lucene.analysis.ko.util;
 import java.io.BufferedOutputStream;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.OutputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import org.apache.lucene.analysis.ko.dict.ConnectionCosts;
@ -28,7 +28,7 @@ import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.OutputStreamDataOutput;
-public final class ConnectionCostsWriter {
+final class ConnectionCostsWriter {
  private final short[][] costs; // array is backward IDs first since get is called using the same backward ID consecutively. maybe doesn't matter.
  private final int forwardSize;
@ -36,7 +36,7 @@ public final class ConnectionCostsWriter {
  /**
   * Constructor for building. TODO: remove write access
   */
-  public ConnectionCostsWriter(int forwardSize, int backwardSize) {
+  ConnectionCostsWriter(int forwardSize, int backwardSize) {
    this.forwardSize = forwardSize;
    this.backwardSize = backwardSize;
    this.costs = new short[backwardSize][forwardSize];
@ -46,14 +46,12 @@ public final class ConnectionCostsWriter {
    this.costs[backwardId][forwardId] = (short)cost;
  }
-  public void write(String baseDir) throws IOException {
+  public void write(Path baseDir) throws IOException {
-    String filename = baseDir + File.separator +
+    Files.createDirectories(baseDir);
-      ConnectionCosts.class.getName().replace('.', File.separatorChar) + ConnectionCosts.FILENAME_SUFFIX;
+    String fileName = ConnectionCosts.class.getName().replace('.', '/') + ConnectionCosts.FILENAME_SUFFIX;
-    new File(filename).getParentFile().mkdirs();
+    try (OutputStream os = Files.newOutputStream(baseDir.resolve(fileName));
-    OutputStream os = new FileOutputStream(filename);
+         OutputStream bos = new BufferedOutputStream(os)) {
-    try {
+      final DataOutput out = new OutputStreamDataOutput(bos);
      os = new BufferedOutputStream(os);
      final DataOutput out = new OutputStreamDataOutput(os);
      CodecUtil.writeHeader(out, ConnectionCosts.HEADER, ConnectionCosts.VERSION);
      out.writeVInt(forwardSize);
      out.writeVInt(backwardSize);
@ -61,14 +59,12 @@ public final class ConnectionCostsWriter {
      assert costs.length == backwardSize;
      for (short[] a : costs) {
        assert a.length == forwardSize;
-        for (int i = 0; i < a.length; i++) {
+        for (short cost : a) {
-          int delta = (int)a[i] - last;
+          int delta = (int) cost - last;
          out.writeZInt(delta);
-          last = a[i];
+          last = cost;
        }
      }
    } finally {
      os.close();
    }
  }
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/DictionaryBuilder.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/DictionaryBuilder.java
@ -0,0 +1,54 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.ko.util;
 import java.io.IOException;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 /**
 * Tool to build dictionaries.
 *
 * <p>Command line usage: {@code DictionaryBuilder <inputDir> <outputDir> <encoding> <normalizeEntries>}.
 */
 public class DictionaryBuilder {
  private DictionaryBuilder() {
  }

  /**
   * Builds the token info dictionary, the unknown word dictionary and the connection costs
   * from the sources in {@code inputDir} and writes each of them to {@code outputDir}.
   *
   * @param inputDir directory holding the dictionary sources; the connection costs are read
   *                 from the {@code matrix.def} file inside it
   * @param outputDir directory handed to each writer as its output base directory
   * @param encoding charset name passed to the token info and unknown word dictionary builders
   * @param normalizeEntry flag passed to the token info dictionary builder
   * @throws IOException if reading the sources or writing the dictionaries fails
   */
  public static void build(Path inputDir, Path outputDir, String encoding, boolean normalizeEntry) throws IOException {
    // Build TokenInfo Dictionary
    new TokenInfoDictionaryBuilder(encoding, normalizeEntry)
        .build(inputDir)
        .write(outputDir);
    // Build Unknown Word Dictionary
    new UnknownDictionaryBuilder(encoding)
        .build(inputDir)
        .write(outputDir);
    // Build Connection Cost
    ConnectionCostsBuilder.build(inputDir.resolve("matrix.def"))
        .write(outputDir);
  }

  /**
   * Command line entry point.
   *
   * @param args input directory, output directory, input encoding and the normalize flag
   *             (parsed with {@link Boolean#parseBoolean(String)}), in that order
   * @throws IOException if building the dictionaries fails
   * @throws IllegalArgumentException if fewer than four arguments are given
   */
  public static void main(String[] args) throws IOException {
    if (args.length < 4) {
      // Fail with a usage hint rather than an ArrayIndexOutOfBoundsException.
      throw new IllegalArgumentException(
          "Usage: DictionaryBuilder <inputDir> <outputDir> <encoding> <normalizeEntries> (got "
              + args.length + " argument(s))");
    }
    String inputDirname = args[0];
    String outputDirname = args[1];
    String inputEncoding = args[2];
    boolean normalizeEntries = Boolean.parseBoolean(args[3]);
    DictionaryBuilder.build(Paths.get(inputDirname), Paths.get(outputDirname), inputEncoding, normalizeEntries);
  }
 }
--- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java
+++ b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java
@ -17,20 +17,17 @@
 package org.apache.lucene.analysis.ko.util;
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FilenameFilter;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
+import java.nio.file.Files;
-import java.nio.charset.CodingErrorAction;
+import java.nio.file.Path;
 import java.text.Normalizer;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.fst.Builder;
@ -38,72 +35,59 @@ import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
-public class TokenInfoDictionaryBuilder {
+class TokenInfoDictionaryBuilder {
  /** Internal word id - incrementally assigned as entries are read and added. This will be byte offset of dictionary file */
  private int offset = 0;
-  private String encoding = "utf-8";
+  private String encoding;
  private Normalizer.Form normalForm;
-  public TokenInfoDictionaryBuilder(String encoding, boolean normalizeEntries) {
+  TokenInfoDictionaryBuilder(String encoding, boolean normalizeEntries) {
    this.encoding = encoding;
-    this.normalForm = normalizeEntries ? Normalizer.Form.NFKC : null;
+    normalForm = normalizeEntries ? Normalizer.Form.NFKC : null;
  }
-  public TokenInfoDictionaryWriter build(String dirname) throws IOException {
+  public TokenInfoDictionaryWriter build(Path dir) throws IOException {
-    FilenameFilter filter = (dir, name) -> name.endsWith(".csv");
+    try (Stream<Path> files = Files.list(dir)) {
-    ArrayList<File> csvFiles = new ArrayList<>();
+      List<Path> csvFiles = files
-    for (File file : new File(dirname).listFiles(filter)) {
+          .filter(path -> path.getFileName().toString().endsWith(".csv"))
-      csvFiles.add(file);
+          .sorted()
          .collect(Collectors.toList());
      return buildDictionary(csvFiles);
    }
    Collections.sort(csvFiles);
    return buildDictionary(csvFiles);
  }
-  public TokenInfoDictionaryWriter buildDictionary(List<File> csvFiles) throws IOException {
+  private TokenInfoDictionaryWriter buildDictionary(List<Path> csvFiles) throws IOException {
    TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);
    // all lines in the file
    System.out.println("  parse...");
    List<String[]> lines = new ArrayList<>(400000);
-    for (File file : csvFiles){
+    for (Path path : csvFiles) {
-      FileInputStream inputStream = new FileInputStream(file);
+      try (BufferedReader reader = Files.newBufferedReader(path, Charset.forName(encoding))) {
-      Charset cs = Charset.forName(encoding);
+        String line;
-      CharsetDecoder decoder = cs.newDecoder()
+        while ((line = reader.readLine()) != null) {
-          .onMalformedInput(CodingErrorAction.REPORT)
+          String[] entry = CSVUtil.parse(line);
          .onUnmappableCharacter(CodingErrorAction.REPORT);
      InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
      BufferedReader reader = new BufferedReader(streamReader);
-      String line = null;
+          if (entry.length < 12) {
-      while ((line = reader.readLine()) != null) {
+            throw new IllegalArgumentException("Entry in CSV is not valid (12 field values expected): " + line);
-        String[] entry = CSVUtil.parse(line);
+          }
-
+
-        if(entry.length < 12) {
+          // NFKC normalize dictionary entry
-          throw new IllegalArgumentException("Entry in CSV is not valid (12 field values expected): " + line);
+          if (normalForm != null) {
-        }
+            String[] normalizedEntry = new String[entry.length];
-
+            for (int i = 0; i < entry.length; i++) {
-        // NFKC normalize dictionary entry
+              normalizedEntry[i] = Normalizer.normalize(entry[i], normalForm);
-        if (normalForm != null) {
+            }
-          String[] normalizedEntry = new String[entry.length];
+            lines.add(normalizedEntry);
-          for (int i = 0; i < entry.length; i++) {
+          } else {
-            normalizedEntry[i] = Normalizer.normalize(entry[i], normalForm);
+            lines.add(entry);
          }
          lines.add(normalizedEntry);
        } else {
          lines.add(entry);
        }
      }
    }
    System.out.println("  sort...");
    // sort by term: we sorted the files already and use a stable sort.
-    Collections.sort(lines, Comparator.comparing(left -> left[0]));
+    lines.sort(Comparator.comparing(left -> left[0]));
    System.out.println("  encode...");
    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
@ -111,7 +95,7 @@ public class TokenInfoDictionaryBuilder {
    long ord = -1; // first ord will be 0
    String lastValue = null;
-    // build tokeninfo dictionary
+    // build token info dictionary
    for (String[] entry : lines) {
      String surfaceForm = entry[0].trim();
      if (surfaceForm.isEmpty()) {
@ -119,9 +103,8 @@ public class TokenInfoDictionaryBuilder {
      }
      int next = dictionary.put(entry);
-      if(next == offset){
+      if(next == offset) {
-        System.out.println("Failed to process line: " + Arrays.toString(entry));
+        throw new IllegalStateException("Failed to process line: " + Arrays.toString(entry));
        continue;
      }
      if (!surfaceForm.equals(lastValue)) {
@ -135,16 +118,10 @@ public class TokenInfoDictionaryBuilder {
        }
        fstBuilder.add(scratch.get(), ord);
      }
-      dictionary.addMapping((int)ord, offset);
+      dictionary.addMapping((int) ord, offset);
      offset = next;
    }
-
+    dictionary.setFST(fstBuilder.finish());
    final FST<Long> fst = fstBuilder.finish();
    System.out.print("  " + fstBuilder.getNodeCount() + " nodes, " + fstBuilder.getArcCount() + " arcs, " + fst.ramBytesUsed() + " bytes...  ");
    dictionary.setFST(fst);
    System.out.println(" done");
    return dictionary;
  }
 }
--- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryWriter.java
+++ b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryWriter.java
@ -19,31 +19,31 @@ package org.apache.lucene.analysis.ko.util;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.nio.file.Paths;
+import java.util.Objects;
 import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary;
 import org.apache.lucene.util.fst.FST;
-public class TokenInfoDictionaryWriter extends BinaryDictionaryWriter {
+class TokenInfoDictionaryWriter extends BinaryDictionaryWriter {
  private FST<Long> fst;
-  public TokenInfoDictionaryWriter(int size) {
+  TokenInfoDictionaryWriter(int size) {
    super(TokenInfoDictionary.class, size);
  }
  public void setFST(FST<Long> fst) {
    Objects.requireNonNull(fst, "dictionary must not be empty");
    this.fst = fst;
  }
  @Override
-  public void write(String baseDir) throws IOException {
+  public void write(Path baseDir) throws IOException {
    super.write(baseDir);
-    writeFST(getBaseFileName(baseDir) + TokenInfoDictionary.FST_FILENAME_SUFFIX);
+    writeFST(baseDir.resolve(getBaseFileName() + TokenInfoDictionary.FST_FILENAME_SUFFIX));
  }
-  protected void writeFST(String filename) throws IOException {
+  private void writeFST(Path path) throws IOException {
-    Path p = Paths.get(filename);
+    Files.createDirectories(path.getParent());
-    Files.createDirectories(p.getParent());
+    fst.save(path);
    fst.save(p);
  }  
 }
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryBuilder.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryBuilder.java
@ -0,0 +1,118 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.ko.util;
 import java.io.IOException;
 import java.io.LineNumberReader;
 import java.io.Reader;
 import java.nio.charset.Charset;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.Comparator;
 import java.util.List;
 import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
 /**
 * Builds an {@link UnknownDictionaryWriter} from the {@code unk.def} and {@code char.def}
 * files found in a dictionary source directory.
 */
 class UnknownDictionaryBuilder {
  private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,1798,3559,3677,SY,*,*,*,*,*,*,*";

  /** Name of the charset used to decode both definition files. */
  private final String encoding;

  UnknownDictionaryBuilder(String encoding) {
    this.encoding = encoding;
  }

  /**
   * Reads {@code unk.def} and {@code char.def} from {@code dir}.
   *
   * @param dir directory containing the two definition files
   * @return a writer holding the unknown-word entries and the character definitions
   * @throws IOException if either file cannot be read
   */
  public UnknownDictionaryWriter build(Path dir) throws IOException {
    UnknownDictionaryWriter unkDictionary = readDictionaryFile(dir.resolve("unk.def"));  //Should be only one file
    readCharacterDefinition(dir.resolve("char.def"), unkDictionary);
    return unkDictionary;
  }

  private UnknownDictionaryWriter readDictionaryFile(Path path) throws IOException {
    return readDictionaryFile(path, encoding);
  }

  /**
   * Loads the unknown-word entries: the fixed NGRAM entry is added first, then every line of
   * the file, ordered by the character class id of its first field.
   */
  private UnknownDictionaryWriter readDictionaryFile(Path path, String encoding) throws IOException {
    UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);
    List<String[]> lines = new ArrayList<>();
    try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding));
         LineNumberReader lineReader = new LineNumberReader(reader)) {
      dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));
      String line;
      while ((line = lineReader.readLine()) != null) {
        // note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation,
        // even though the unknown dictionary returns hardcoded null here.
        final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
        lines.add(parsed);
      }
    }
    lines.sort(Comparator.comparingInt(entry -> CharacterDefinition.lookupCharacterClass(entry[0])));
    for (String[] entry : lines) {
      dictionary.put(entry);
    }
    return dictionary;
  }

  /**
   * Parses the character definition file. Lines starting with {@code 0x} map a code point, or
   * a {@code from..to} range of code points, to a character category; every other non-empty
   * line is an invoke definition of the form {@code <class> <invoke> <group> <length>}.
   *
   * @throws IllegalArgumentException if a line has fewer fields than its form requires
   */
  private void readCharacterDefinition(Path path, UnknownDictionaryWriter dictionary) throws IOException {
    try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding));
         LineNumberReader lineReader = new LineNumberReader(reader)) {
      String line;
      while ((line = lineReader.readLine()) != null) {
        // Drop the trailing comment, collapse whitespace runs to a single space, then trim.
        // Trimming (rather than removing only one leading whitespace character) keeps lines
        // indented by several characters, or consisting only of whitespace, from being
        // misparsed as invoke definitions.
        line = line.replaceAll("\\s*#.*", "");
        line = line.replaceAll("\\s+", " ").trim();
        // Skip empty line or comment line
        if (line.isEmpty()) {
          continue;
        }
        if (line.startsWith("0x")) {  // Category mapping
          String[] values = line.split(" ", 2);  // Split only first space
          if (values.length < 2) {
            throw new IllegalArgumentException("Category mapping without a character class at "
                + path + ":" + lineReader.getLineNumber() + ": " + line);
          }
          if (!values[0].contains("..")) {
            int cp = Integer.decode(values[0]);
            dictionary.putCharacterCategory(cp, values[1]);
          } else {
            String[] codePoints = values[0].split("\\.\\.");
            int cpFrom = Integer.decode(codePoints[0]);
            int cpTo = Integer.decode(codePoints[1]);
            for (int i = cpFrom; i <= cpTo; i++) {
              dictionary.putCharacterCategory(i, values[1]);
            }
          }
        } else {  // Invoke definition
          String[] values = line.split(" "); // Consecutive space is merged above
          if (values.length < 4) {
            throw new IllegalArgumentException("Invoke definition needs 4 fields at "
                + path + ":" + lineReader.getLineNumber() + ": " + line);
          }
          String characterClassName = values[0];
          int invoke = Integer.parseInt(values[1]);
          int group = Integer.parseInt(values[2]);
          int length = Integer.parseInt(values[3]);
          dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
        }
      }
    }
  }
 }
--- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryWriter.java
+++ b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryWriter.java
@ -17,11 +17,12 @@
 package org.apache.lucene.analysis.ko.util;
 import java.io.IOException;
 import java.nio.file.Path;
 import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
 import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
-public class UnknownDictionaryWriter extends BinaryDictionaryWriter {
+class UnknownDictionaryWriter extends BinaryDictionaryWriter {
  private final CharacterDefinitionWriter characterDefinition = new CharacterDefinitionWriter();
@ -58,7 +59,7 @@ public class UnknownDictionaryWriter extends BinaryDictionaryWriter {
  }
  @Override
-  public void write(String baseDir) throws IOException {
+  public void write(Path baseDir) throws IOException {
    super.write(baseDir);
    characterDefinition.write(baseDir);
  }
--- a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat
+++ b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java
@ -16,15 +16,74 @@
 */
 package org.apache.lucene.analysis.ko.dict;
 import java.io.OutputStream;
 import java.io.OutputStreamWriter;
 import java.io.PrintWriter;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import org.apache.lucene.analysis.ko.POS;
 import org.apache.lucene.analysis.ko.util.DictionaryBuilder;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.IntsRefFSTEnum;
 import org.apache.lucene.util.fst.IntsRefFSTEnum.InputOutput;
-public class TestTokenInfoDictionary extends LuceneTestCase {
+import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.ResourceScheme;
 /**
 * Tests of TokenInfoDictionary build tools; run using ant test-tools
 */
 public class TokenInfoDictionaryTest extends LuceneTestCase {
  /**
   * Builds a two-entry dictionary and checks that left id, right id and word cost of each
   * entry survive the build/load round trip, including an entry with a "large" id.
   */
  public void testPut() throws Exception {
    TokenInfoDictionary dictionary = newDictionary(
        "명사,1,1,2,NNG,*,*,*,*,*,*,*",
        // "large" id
        "일반,5000,5000,3,NNG,*,*,*,*,*,*,*");
    // Row i holds the values expected for lookup id i: {leftId, rightId, wordCost}.
    int[][] expected = {
        {1, 1, 2},
        {5000, 5000, 3},
    };
    IntsRef ids = new IntsRefBuilder().get();
    for (int sourceId = 0; sourceId < expected.length; sourceId++) {
      dictionary.lookupWordIds(sourceId, ids);
      int wordId = ids.ints[ids.offset];
      assertEquals(expected[sourceId][0], dictionary.getLeftId(wordId));
      assertEquals(expected[sourceId][1], dictionary.getRightId(wordId));
      assertEquals(expected[sourceId][2], dictionary.getWordCost(wordId));
    }
  }
  /**
   * Writes the given CSV entries plus minimal {@code unk.def}, {@code char.def} and
   * {@code matrix.def} files into a fresh temp directory, runs {@link DictionaryBuilder} over
   * it, and loads the resulting {@link TokenInfoDictionary} from that same directory.
   */
  private TokenInfoDictionary newDictionary(String... entries) throws Exception {
    Path workDir = createTempDir();
    writeUtf8Lines(workDir.resolve("test.csv"), entries);
    // Empty unknown-word and character definition files are created alongside the CSV.
    Files.createFile(workDir.resolve("unk.def"));
    Files.createFile(workDir.resolve("char.def"));
    writeUtf8Lines(workDir.resolve("matrix.def"), "1 1");
    DictionaryBuilder.build(workDir, workDir, "utf-8", true);
    // We must also load the other files (in BinaryDictionary) from the correct path
    String resourcePath = TokenInfoDictionary.class.getName().replace('.', '/');
    Path dictionaryLocation = workDir.resolve(resourcePath);
    return new TokenInfoDictionary(ResourceScheme.FILE, dictionaryLocation.toString());
  }

  /** Writes each of {@code lines} to {@code file} as UTF-8, one per line. */
  private static void writeUtf8Lines(Path file, String... lines) throws Exception {
    try (OutputStream out = Files.newOutputStream(file);
         PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8))) {
      for (int i = 0; i < lines.length; i++) {
        printer.println(lines[i]);
      }
    }
  }
  /** Checks that malformed entries make the dictionary build fail with IllegalArgumentException. */
  public void testPutException() {
    //too few columns
    final String tooFewColumns = "HANGUL,1,1,1,NNG,*,*,*,*,*";
    // id too large
    final String idTooLarge = "HANGUL,8192,8192,1,NNG,*,*,*,*,*,*,*";
    expectThrows(IllegalArgumentException.class, () -> newDictionary(tooFewColumns));
    expectThrows(IllegalArgumentException.class, () -> newDictionary(idTooLarge));
  }
  /** enumerates the entire FST/lookup data and just does basic sanity checks */
  public void testEnumerateAll() throws Exception {
@ -38,12 +97,12 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
    ConnectionCosts matrix = ConnectionCosts.getInstance();
    FST<Long> fst = tid.getFST().getInternalFST();
    IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst);
-    InputOutput<Long> mapping;
+    IntsRefFSTEnum.InputOutput<Long> mapping;
    IntsRef scratch = new IntsRef();
    while ((mapping = fstEnum.next()) != null) {
      numTerms++;
      IntsRef input = mapping.input;
-      char chars[] = new char[input.length];
+      char[] chars = new char[input.length];
      for (int i = 0; i < chars.length; i++) {
        chars[i] = (char)input.ints[input.offset+i];
      }
--- a/lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/UnknownDictionaryTest.java
+++ b/lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/UnknownDictionaryTest.java
@ -14,11 +14,8 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-package org.apache.lucene.analysis.ko.dict;
+package org.apache.lucene.analysis.ko.util;
 import org.apache.lucene.analysis.ko.util.CSVUtil;
 import org.apache.lucene.analysis.ko.util.UnknownDictionaryWriter;
 import org.apache.lucene.util.LuceneTestCase;
 import org.junit.Test;
--- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/ConnectionCostsBuilder.java
+++ b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/ConnectionCostsBuilder.java
@ -1,67 +0,0 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.ko.util;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.LineNumberReader;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
 import java.nio.charset.StandardCharsets;
 /**
  * Reads a textual connection cost matrix and loads it into a {@link ConnectionCostsWriter}.
  *
  * <p>The first line holds two whitespace-separated integers (forward size and backward size).
  * Every following line holds three whitespace-separated integers: forward id, backward id and
  * cost. The file is decoded strictly as US-ASCII; malformed or unmappable bytes are reported
  * as errors rather than replaced.
  */
 public class ConnectionCostsBuilder {
  private ConnectionCostsBuilder() {
  }

  /**
   * Parses the matrix file and returns a writer populated with every cost entry.
   *
   * @param filename path of the matrix definition file
   * @return writer sized from the header line and filled from the remaining lines
   * @throws IOException if the file cannot be opened, read or decoded
   * @throws IllegalArgumentException if the file is empty (a malformed number surfaces as
   *         {@link NumberFormatException}, which is a subclass)
   */
  public static ConnectionCostsWriter build(String filename) throws IOException {
    Charset cs = StandardCharsets.US_ASCII;
    CharsetDecoder decoder = cs.newDecoder()
        .onMalformedInput(CodingErrorAction.REPORT)
        .onUnmappableCharacter(CodingErrorAction.REPORT);
    // try-with-resources: the stream and readers are released on both the normal and the
    // exceptional path instead of being left open.
    try (FileInputStream inputStream = new FileInputStream(filename);
         InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
         LineNumberReader lineReader = new LineNumberReader(streamReader)) {
      String line = lineReader.readLine();
      if (line == null) {
        // An empty file has no header line; fail with a message instead of a NullPointerException.
        throw new IllegalArgumentException("Connection cost file is empty: " + filename);
      }
      String[] dimensions = line.split("\\s+");
      assert dimensions.length == 2;
      int forwardSize = Integer.parseInt(dimensions[0]);
      int backwardSize = Integer.parseInt(dimensions[1]);
      assert forwardSize > 0 && backwardSize > 0;
      ConnectionCostsWriter costs = new ConnectionCostsWriter(forwardSize, backwardSize);
      while ((line = lineReader.readLine()) != null) {
        String[] fields = line.split("\\s+");
        assert fields.length == 3;
        int forwardId = Integer.parseInt(fields[0]);
        int backwardId = Integer.parseInt(fields[1]);
        int cost = Integer.parseInt(fields[2]);
        costs.add(forwardId, backwardId, cost);
      }
      return costs;
    }
  }
 }
--- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/DictionaryBuilder.java
+++ b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/DictionaryBuilder.java
@ -1,67 +0,0 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.ko.util;
 import java.io.File;
 import java.io.IOException;
 /**
  * Command-line tool that runs the three dictionary build steps in sequence: the token info
  * dictionary, the unknown word dictionary, and the connection costs. Each step reads from the
  * input directory and writes its result to the output directory.
  */
 public class DictionaryBuilder {
  private DictionaryBuilder() {
  }

  /**
   * Builds all dictionary parts from {@code inputDirname} and writes them to
   * {@code outputDirname}, printing progress to standard output.
   *
   * @param inputDirname directory holding the dictionary sources (including {@code matrix.def})
   * @param outputDirname directory the writers are asked to write into
   * @param encoding charset name passed to the token info and unknown dictionary builders
   * @param normalizeEntry flag passed through to {@code TokenInfoDictionaryBuilder}
   * @throws IOException if reading the sources or writing the results fails
   */
  public static void build(String inputDirname, String outputDirname, String encoding, boolean normalizeEntry) throws IOException {
    System.out.println("building tokeninfo dict...");
    TokenInfoDictionaryBuilder tokenInfoBuilder = new TokenInfoDictionaryBuilder(encoding, normalizeEntry);
    TokenInfoDictionaryWriter tokenInfoDictionary = tokenInfoBuilder.build(inputDirname);
    tokenInfoDictionary.write(outputDirname);
    // References are cleared before the next step starts — presumably so the finished
    // structures can be reclaimed early; confirm before removing.
    tokenInfoDictionary = null;
    tokenInfoBuilder = null;
    System.out.println("done");
    System.out.print("building unknown word dict...");
    UnknownDictionaryBuilder unkBuilder = new UnknownDictionaryBuilder(encoding);
    UnknownDictionaryWriter unkDictionary = unkBuilder.build(inputDirname);
    unkDictionary.write(outputDirname);
    unkDictionary = null;
    unkBuilder = null;
    System.out.println("done");
    System.out.print("building connection costs...");
    ConnectionCostsWriter connectionCosts
      = ConnectionCostsBuilder.build(inputDirname + File.separator + "matrix.def");
    connectionCosts.write(outputDirname);
    System.out.println("done");
  }

  /**
   * Entry point. Expects four arguments, in order: input directory, output directory, input
   * encoding, and a boolean (parsed with {@link Boolean#parseBoolean}) that enables entry
   * normalization. Additional arguments are ignored.
   *
   * @throws IllegalArgumentException if fewer than four arguments are supplied
   * @throws IOException if the build fails
   */
  public static void main(String[] args) throws IOException {
    // Fail with a usage message rather than an ArrayIndexOutOfBoundsException on args[n].
    if (args == null || args.length < 4) {
      throw new IllegalArgumentException(
          "usage: DictionaryBuilder <input directory> <output directory> <input encoding> <normalize entries: true|false>");
    }
    String inputDirname = args[0];
    String outputDirname = args[1];
    String inputEncoding = args[2];
    boolean normalizeEntries = Boolean.parseBoolean(args[3]);
    System.out.println("dictionary builder");
    System.out.println("");
    System.out.println("input directory: " + inputDirname);
    System.out.println("output directory: " + outputDirname);
    System.out.println("input encoding: " + inputEncoding);
    System.out.println("normalize entries: " + normalizeEntries);
    System.out.println("");
    DictionaryBuilder.build(inputDirname, outputDirname, inputEncoding, normalizeEntries);
  }
 }
--- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryBuilder.java
+++ b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryBuilder.java
@ -1,134 +0,0 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.ko.util;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.LineNumberReader;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
 import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
 /**
  * Builds an {@link UnknownDictionaryWriter} from the {@code unk.def} (unknown word entries)
  * and {@code char.def} (character categories and invoke definitions) files of a dictionary
  * source directory.
  */
 public class UnknownDictionaryBuilder {
  // Entry that is always put into the dictionary first, before anything read from unk.def.
  private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,1798,3559,3677,SY,*,*,*,*,*,*,*";
  private String encoding = "utf-8";

  /**
   * @param encoding charset name used to decode the definition files
   */
  public UnknownDictionaryBuilder(String encoding) {
    this.encoding = encoding;
  }

  /**
   * Reads {@code unk.def} and {@code char.def} from {@code dirname}.
   *
   * @param dirname directory containing both definition files
   * @return writer holding the unknown word entries and the character definitions
   * @throws IOException if either file cannot be read
   */
  public UnknownDictionaryWriter build(String dirname) throws IOException {
    UnknownDictionaryWriter unkDictionary = readDictionaryFile(dirname + File.separator + "unk.def");  //Should be only one file
    readCharacterDefinition(dirname + File.separator + "char.def", unkDictionary);
    return unkDictionary;
  }

  /** Same as {@link #readDictionaryFile(String, String)} using this builder's encoding. */
  public UnknownDictionaryWriter readDictionaryFile(String filename)
      throws IOException {
    return readDictionaryFile(filename, encoding);
  }

  /**
   * Reads unknown word entries, one CSV line each, and puts them into a new writer ordered by
   * the character class id of their first field. The fixed NGRAM entry is put first.
   *
   * @param filename path of the unknown word definition file
   * @param encoding charset name used to decode the file; malformed input is reported as an error
   * @return writer containing the NGRAM entry followed by the sorted file entries
   * @throws IOException if the file cannot be opened, read or decoded
   */
  public UnknownDictionaryWriter readDictionaryFile(String filename, String encoding)
      throws IOException {
    UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);
    List<String[]> lines = new ArrayList<>();
    // try-with-resources so the file handle is released even when decoding or parsing fails.
    try (FileInputStream inputStream = new FileInputStream(filename)) {
      Charset cs = Charset.forName(encoding);
      CharsetDecoder decoder = cs.newDecoder()
          .onMalformedInput(CodingErrorAction.REPORT)
          .onUnmappableCharacter(CodingErrorAction.REPORT);
      InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
      LineNumberReader lineReader = new LineNumberReader(streamReader);
      dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));
      String line;
      while ((line = lineReader.readLine()) != null) {
        // note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation,
        // even though the unknown dictionary returns hardcoded null here.
        final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
        lines.add(parsed);
      }
    }
    // comparingInt avoids the subtraction idiom, which is only correct while ids cannot overflow.
    Collections.sort(lines,
        Comparator.comparingInt(entry -> CharacterDefinition.lookupCharacterClass(entry[0])));
    for (String[] entry : lines) {
      dictionary.put(entry);
    }
    return dictionary;
  }

  /**
   * Parses a character definition file and feeds every entry to {@code dictionary}.
   *
   * <p>After {@code #} comments are removed, lines starting with {@code 0x} map a code point
   * (or an inclusive {@code from..to} range) to category text via
   * {@code putCharacterCategory}; every other non-empty line is a character class name followed
   * by invoke, group and length integers, passed to {@code putInvokeDefinition}.
   *
   * @param filename path of the character definition file, decoded with this builder's encoding
   * @param dictionary writer that receives the parsed definitions
   * @throws IOException if the file cannot be opened or read
   */
  public void readCharacterDefinition(String filename, UnknownDictionaryWriter dictionary) throws IOException {
    try (FileInputStream inputStream = new FileInputStream(filename);
         InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
         LineNumberReader lineReader = new LineNumberReader(streamReader)) {
      String line;
      while ((line = lineReader.readLine()) != null) {
        // Drop the comment, collapse whitespace runs, then trim BOTH ends. Stripping just one
        // leading whitespace character would leave an empty first field on lines indented by
        // two or more characters and would let whitespace-only lines reach the parser.
        line = line.replaceAll("\\s*#.*", "");
        line = line.replaceAll("\\s+", " ").trim();
        // Skip empty line or comment line
        if (line.length() == 0) {
          continue;
        }
        if (line.startsWith("0x")) {  // Category mapping
          String[] values = line.split(" ", 2);  // Split only first space
          if (!values[0].contains("..")) {
            int cp = Integer.decode(values[0]);
            dictionary.putCharacterCategory(cp, values[1]);
          } else {
            // "0xAAAA..0xBBBB" assigns the same category to every code point in the range.
            String[] codePoints = values[0].split("\\.\\.");
            int cpFrom = Integer.decode(codePoints[0]);
            int cpTo = Integer.decode(codePoints[1]);
            for (int i = cpFrom; i <= cpTo; i++) {
              dictionary.putCharacterCategory(i, values[1]);
            }
          }
        } else {  // Invoke definition
          String[] values = line.split(" "); // Consecutive space is merged above
          String characterClassName = values[0];
          int invoke = Integer.parseInt(values[1]);
          int group = Integer.parseInt(values[2]);
          int length = Integer.parseInt(values[3]);
          dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
        }
      }
    }
  }
 }
--- a/lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java
+++ b/lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java
@ -1,80 +0,0 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.ko.dict;
 import java.io.OutputStream;
 import java.io.OutputStreamWriter;
 import java.io.PrintWriter;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import org.apache.lucene.analysis.ko.util.TokenInfoDictionaryBuilder;
 import org.apache.lucene.analysis.ko.util.TokenInfoDictionaryWriter;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.LuceneTestCase;
 import static java.io.File.separatorChar;
 import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.ResourceScheme;
 /**
  * Round-trip tests for the TokenInfoDictionary build tools: entries are written to a CSV file,
  * built into a dictionary on disk, and read back. Run using ant test-tools.
  */
 public class TokenInfoDictionaryTest extends LuceneTestCase {

  /** Valid entries, including one with a "large" id, must keep their ids and cost. */
  public void testPut() throws Exception {
    TokenInfoDictionary dictionary = newDictionary(
        "명사,1,1,2,NNG,*,*,*,*,*,*,*",
        // "large" id
        "일반,5000,5000,3,NNG,*,*,*,*,*,*,*");
    IntsRef scratch = new IntsRefBuilder().get();
    assertEntry(dictionary, scratch, 0, 1, 1, 2);
    assertEntry(dictionary, scratch, 1, 5000, 5000, 3);
  }

  /** Looks up {@code sourceId} and compares the first word id's left id, right id and cost. */
  private static void assertEntry(TokenInfoDictionary dictionary, IntsRef scratch, int sourceId,
                                  int leftId, int rightId, int wordCost) {
    dictionary.lookupWordIds(sourceId, scratch);
    int wordId = scratch.ints[scratch.offset];
    assertEquals(leftId, dictionary.getLeftId(wordId));
    assertEquals(rightId, dictionary.getRightId(wordId));
    assertEquals(wordCost, dictionary.getWordCost(wordId));
  }

  /**
   * Writes {@code entries} to {@code test.csv} in a fresh temp directory, builds and writes the
   * dictionary into that directory, and loads it back through the FILE resource scheme.
   */
  private TokenInfoDictionary newDictionary(String... entries) throws Exception {
    Path tempDir = createTempDir();
    Path csvFile = tempDir.resolve("test.csv");
    try (OutputStream out = Files.newOutputStream(csvFile);
         PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, "utf-8"))) {
      for (int i = 0; i < entries.length; i++) {
        printer.println(entries[i]);
      }
    }
    String location = tempDir.toString();
    TokenInfoDictionaryWriter writer = new TokenInfoDictionaryBuilder("utf-8", true).build(location);
    writer.write(location);
    // We must also load the other files (in BinaryDictionary) from the correct path
    String resourcePath = TokenInfoDictionary.class.getName().replace('.', separatorChar);
    Path dictionaryLocation = tempDir.resolve(resourcePath);
    return new TokenInfoDictionary(ResourceScheme.FILE, dictionaryLocation.toString());
  }

  /** Malformed entries must be rejected with IllegalArgumentException. */
  public void testPutException() throws Exception {
    // too few columns
    final String tooFewColumns = "HANGUL,1,1,1,NNG,*,*,*,*,*";
    // id too large
    final String idTooLarge = "HANGUL,8192,8192,1,NNG,*,*,*,*,*,*,*";
    expectThrows(IllegalArgumentException.class, () -> newDictionary(tooFewColumns));
    expectThrows(IllegalArgumentException.class, () -> newDictionary(idTooLarge));
  }
 }