diff --git a/lucene/analysis/nori/build.xml b/lucene/analysis/nori/build.xml
index 6b82816b807..7d5b0b99f41 100644
--- a/lucene/analysis/nori/build.xml
+++ b/lucene/analysis/nori/build.xml
@@ -26,7 +26,6 @@
@@ -45,6 +44,9 @@
@@ -57,28 +59,14 @@
@@ -90,34 +78,7 @@
[build.xml hunk bodies not recoverable: the XML element content was lost in extraction; only the hunk positions survive]
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java
index 7c645a5cc11..02481e12e0e 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java
@@ -21,7 +21,6 @@ import java.io.InputStream;
import java.io.IOException;
import org.apache.lucene.store.InputStreamDataInput;
-import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
@@ -46,20 +45,9 @@ public final class TokenInfoDictionary extends BinaryDictionary {
*/
TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
super(resourceScheme, resourcePath);
- InputStream is = null;
FST<Long> fst;
- boolean success = false;
- try {
- is = getResource(FST_FILENAME_SUFFIX);
- is = new BufferedInputStream(is);
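+ // try-with-resources closes the stream on both the success and exception paths, replacing the manual IOUtils handling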
+ try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {
fst = new FST<>(new InputStreamDataInput(is), PositiveIntOutputs.getSingleton());
- success = true;
- } finally {
- if (success) {
- IOUtils.close(is);
- } else {
- IOUtils.closeWhileHandlingException(is);
- }
}
this.fst = new TokenInfoFST(fst);
}
diff --git a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java
similarity index 83%
rename from lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java
rename to lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java
index db57d4fd66f..6a19b1b56a8 100644
--- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java
@@ -17,13 +17,13 @@
package org.apache.lucene.analysis.ko.util;
import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@@ -37,17 +37,17 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.analysis.ko.dict.BinaryDictionary;
-public abstract class BinaryDictionaryWriter {
+abstract class BinaryDictionaryWriter {
private final static int ID_LIMIT = 8192;
- protected final Class<? extends BinaryDictionary> implClazz;
+ private final Class<? extends BinaryDictionary> implClazz;
protected ByteBuffer buffer;
private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1;
private int[] targetMap = new int[8192];
private int[] targetMapOffsets = new int[8192];
private final ArrayList<String> posDict = new ArrayList<>();
- public BinaryDictionaryWriter(Class<? extends BinaryDictionary> implClazz, int size) {
+ BinaryDictionaryWriter(Class<? extends BinaryDictionary> implClazz, int size) {
this.implClazz = implClazz;
buffer = ByteBuffer.allocate(size);
}
@@ -183,7 +183,7 @@ public abstract class BinaryDictionaryWriter {
}
}
- public void addMapping(int sourceId, int wordId) {
+ void addMapping(int sourceId, int wordId) {
if (wordId <= lastWordId) {
throw new IllegalStateException("words out of order: " + wordId + " vs lastID: " + lastWordId);
}
@@ -205,27 +205,26 @@ public abstract class BinaryDictionaryWriter {
lastWordId = wordId;
}
- protected final String getBaseFileName(String baseDir) {
- return baseDir + File.separator + implClazz.getName().replace('.', File.separatorChar);
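+ // the base name is now a '/'-separated relative path; callers resolve it against an output directory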
+ final String getBaseFileName() {
+ return implClazz.getName().replace('.', '/');
}
/**
* Write dictionary in file
* @throws IOException if an I/O error occurs writing the dictionary files
*/
- public void write(String baseDir) throws IOException {
- final String baseName = getBaseFileName(baseDir);
- writeDictionary(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX);
- writeTargetMap(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX);
- writePosDict(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX);
+ public void write(Path baseDir) throws IOException {
+ final String baseName = getBaseFileName();
+ writeDictionary(baseDir.resolve(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX));
+ writeTargetMap(baseDir.resolve(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX));
+ writePosDict(baseDir.resolve(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX));
}
- protected void writeTargetMap(String filename) throws IOException {
- new File(filename).getParentFile().mkdirs();
- OutputStream os = new FileOutputStream(filename);
- try {
- os = new BufferedOutputStream(os);
- final DataOutput out = new OutputStreamDataOutput(os);
+ private void writeTargetMap(Path path) throws IOException {
+ Files.createDirectories(path.getParent());
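+ // resources close in reverse declaration order, so the buffered stream flushes before the underlying file stream closes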
+ try (OutputStream os = Files.newOutputStream(path);
+ OutputStream bos = new BufferedOutputStream(os)) {
+ final DataOutput out = new OutputStreamDataOutput(bos);
CodecUtil.writeHeader(out, BinaryDictionary.TARGETMAP_HEADER, BinaryDictionary.VERSION);
final int numSourceIds = lastSourceId + 1;
@@ -246,17 +245,14 @@ public abstract class BinaryDictionaryWriter {
if (sourceId != numSourceIds) {
throw new IllegalStateException("sourceId:" + sourceId + " != numSourceIds:" + numSourceIds);
}
- } finally {
- os.close();
}
}
- protected void writePosDict(String filename) throws IOException {
- new File(filename).getParentFile().mkdirs();
- OutputStream os = new FileOutputStream(filename);
- try {
- os = new BufferedOutputStream(os);
- final DataOutput out = new OutputStreamDataOutput(os);
+ private void writePosDict(Path path) throws IOException {
+ Files.createDirectories(path.getParent());
+ try (OutputStream os = Files.newOutputStream(path);
+ OutputStream bos = new BufferedOutputStream(os)) {
+ final DataOutput out = new OutputStreamDataOutput(bos);
CodecUtil.writeHeader(out, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION);
out.writeVInt(posDict.size());
for (String s : posDict) {
@@ -270,25 +266,21 @@ public abstract class BinaryDictionaryWriter {
out.writeByte((byte) POS.Tag.valueOf(data[0]).ordinal());
}
}
- } finally {
- os.close();
}
}
- protected void writeDictionary(String filename) throws IOException {
- new File(filename).getParentFile().mkdirs();
- final FileOutputStream os = new FileOutputStream(filename);
- try {
- final DataOutput out = new OutputStreamDataOutput(os);
+ private void writeDictionary(Path path) throws IOException {
+ Files.createDirectories(path.getParent());
+ try (OutputStream os = Files.newOutputStream(path);
+ OutputStream bos = new BufferedOutputStream(os)) {
+ final DataOutput out = new OutputStreamDataOutput(bos);
CodecUtil.writeHeader(out, BinaryDictionary.DICT_HEADER, BinaryDictionary.VERSION);
out.writeVInt(buffer.position());
- final WritableByteChannel channel = Channels.newChannel(os);
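+ // the channel wraps the same buffered stream as the DataOutput, so the header and the raw buffer bytes stay ordered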
+ final WritableByteChannel channel = Channels.newChannel(bos);
// Write Buffer
buffer.flip(); // set position to 0, set limit to current position
channel.write(buffer);
assert buffer.remaining() == 0L;
- } finally {
- os.close();
}
}
}
diff --git a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/CharacterDefinitionWriter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/CharacterDefinitionWriter.java
similarity index 79%
rename from lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/CharacterDefinitionWriter.java
rename to lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/CharacterDefinitionWriter.java
index 5a785492789..a45bf479d19 100644
--- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/CharacterDefinitionWriter.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/CharacterDefinitionWriter.java
@@ -17,10 +17,10 @@
package org.apache.lucene.analysis.ko.util;
import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
import java.util.Arrays;
import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
@@ -29,7 +29,7 @@ import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.OutputStreamDataOutput;
-public final class CharacterDefinitionWriter {
+final class CharacterDefinitionWriter {
private final byte[] characterCategoryMap = new byte[0x10000];
@@ -39,7 +39,7 @@ public final class CharacterDefinitionWriter {
/**
* Constructor for building. TODO: remove write access
*/
- public CharacterDefinitionWriter() {
+ CharacterDefinitionWriter() {
Arrays.fill(characterCategoryMap, CharacterDefinition.DEFAULT);
}
@@ -50,7 +50,7 @@ public final class CharacterDefinitionWriter {
* code point
* @param characterClassName character class name
*/
- public void putCharacterCategory(int codePoint, String characterClassName) {
+ void putCharacterCategory(int codePoint, String characterClassName) {
characterClassName = characterClassName.split(" ")[0]; // use first
// category
// class
@@ -62,20 +62,17 @@ public final class CharacterDefinitionWriter {
characterCategoryMap[codePoint] = CharacterDefinition.lookupCharacterClass(characterClassName);
}
- public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
+ void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
final byte characterClass = CharacterDefinition.lookupCharacterClass(characterClassName);
invokeMap[characterClass] = invoke == 1;
groupMap[characterClass] = group == 1;
// TODO: length def ignored
}
- public void write(String baseDir) throws IOException {
- String filename = baseDir + File.separator +
- CharacterDefinition.class.getName().replace('.', File.separatorChar) + CharacterDefinition.FILENAME_SUFFIX;
- new File(filename).getParentFile().mkdirs();
- OutputStream os = new FileOutputStream(filename);
- try {
- os = new BufferedOutputStream(os);
+ public void write(Path baseDir) throws IOException {
+ Path path = baseDir.resolve(CharacterDefinition.class.getName().replace('.', '/') + CharacterDefinition.FILENAME_SUFFIX);
+ Files.createDirectories(path.getParent());
+ try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(path))) {
final DataOutput out = new OutputStreamDataOutput(os);
CodecUtil.writeHeader(out, CharacterDefinition.HEADER, CharacterDefinition.VERSION);
out.writeBytes(characterCategoryMap, 0, characterCategoryMap.length);
@@ -86,8 +83,6 @@ public final class CharacterDefinitionWriter {
);
out.writeByte(b);
}
- } finally {
- os.close();
}
}
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/ConnectionCostsBuilder.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/ConnectionCostsBuilder.java
new file mode 100644
index 00000000000..34002d2ff6d
--- /dev/null
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/ConnectionCostsBuilder.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ko.util;
+
+import java.io.IOException;
+import java.io.LineNumberReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+class ConnectionCostsBuilder {
+
+ private ConnectionCostsBuilder() {
+ }
+
+ public static ConnectionCostsWriter build(Path path) throws IOException {
+ try (Reader reader = Files.newBufferedReader(path, StandardCharsets.US_ASCII);
+ LineNumberReader lineReader = new LineNumberReader(reader)) {
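+ // matrix.def format: a "forwardSize backwardSize" header line, then one "forwardId backwardId cost" triple per line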
+
+ String line = lineReader.readLine();
+ String[] dimensions = line.split("\\s+");
+
+ assert dimensions.length == 2;
+
+ int forwardSize = Integer.parseInt(dimensions[0]);
+ int backwardSize = Integer.parseInt(dimensions[1]);
+
+ assert forwardSize > 0 && backwardSize > 0;
+
+ ConnectionCostsWriter costs = new ConnectionCostsWriter(forwardSize, backwardSize);
+
+ while ((line = lineReader.readLine()) != null) {
+ String[] fields = line.split("\\s+");
+
+ assert fields.length == 3;
+
+ int forwardId = Integer.parseInt(fields[0]);
+ int backwardId = Integer.parseInt(fields[1]);
+ int cost = Integer.parseInt(fields[2]);
+
+ costs.add(forwardId, backwardId, cost);
+ }
+ return costs;
+ }
+ }
+}
diff --git a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/ConnectionCostsWriter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/ConnectionCostsWriter.java
similarity index 73%
rename from lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/ConnectionCostsWriter.java
rename to lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/ConnectionCostsWriter.java
index f16f8273917..586290d687f 100644
--- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/ConnectionCostsWriter.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/ConnectionCostsWriter.java
@@ -17,10 +17,10 @@
package org.apache.lucene.analysis.ko.util;
import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
import org.apache.lucene.analysis.ko.dict.ConnectionCosts;
@@ -28,7 +28,7 @@ import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.OutputStreamDataOutput;
-public final class ConnectionCostsWriter {
+final class ConnectionCostsWriter {
private final short[][] costs; // array is backward IDs first since get is called using the same backward ID consecutively. maybe doesn't matter.
private final int forwardSize;
@@ -36,7 +36,7 @@ public final class ConnectionCostsWriter {
/**
* Constructor for building. TODO: remove write access
*/
- public ConnectionCostsWriter(int forwardSize, int backwardSize) {
+ ConnectionCostsWriter(int forwardSize, int backwardSize) {
this.forwardSize = forwardSize;
this.backwardSize = backwardSize;
this.costs = new short[backwardSize][forwardSize];
@@ -46,14 +46,12 @@ public final class ConnectionCostsWriter {
this.costs[backwardId][forwardId] = (short)cost;
}
- public void write(String baseDir) throws IOException {
- String filename = baseDir + File.separator +
- ConnectionCosts.class.getName().replace('.', File.separatorChar) + ConnectionCosts.FILENAME_SUFFIX;
- new File(filename).getParentFile().mkdirs();
- OutputStream os = new FileOutputStream(filename);
- try {
- os = new BufferedOutputStream(os);
- final DataOutput out = new OutputStreamDataOutput(os);
+ public void write(Path baseDir) throws IOException {
+ Files.createDirectories(baseDir);
+ String fileName = ConnectionCosts.class.getName().replace('.', '/') + ConnectionCosts.FILENAME_SUFFIX;
+ try (OutputStream os = Files.newOutputStream(baseDir.resolve(fileName));
+ OutputStream bos = new BufferedOutputStream(os)) {
+ final DataOutput out = new OutputStreamDataOutput(bos);
CodecUtil.writeHeader(out, ConnectionCosts.HEADER, ConnectionCosts.VERSION);
out.writeVInt(forwardSize);
out.writeVInt(backwardSize);
@@ -61,14 +59,12 @@ public final class ConnectionCostsWriter {
assert costs.length == backwardSize;
for (short[] a : costs) {
assert a.length == forwardSize;
- for (int i = 0; i < a.length; i++) {
- int delta = (int)a[i] - last;
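+ // costs are delta-coded against the previous value and written with zig-zag encoding (writeZInt) to keep them compact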
+ for (short cost : a) {
+ int delta = (int) cost - last;
out.writeZInt(delta);
- last = a[i];
+ last = cost;
}
}
- } finally {
- os.close();
}
}
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/DictionaryBuilder.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/DictionaryBuilder.java
new file mode 100644
index 00000000000..889f74406d8
--- /dev/null
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/DictionaryBuilder.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ko.util;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+/**
+ * Tool to build dictionaries.
+ */
+public class DictionaryBuilder {
+
+ private DictionaryBuilder() {
+ }
+
+ public static void build(Path inputDir, Path outputDir, String encoding, boolean normalizeEntry) throws IOException {
+ // Build TokenInfo Dictionary
+ new TokenInfoDictionaryBuilder(encoding, normalizeEntry)
+ .build(inputDir)
+ .write(outputDir);
+
+ // Build Unknown Word Dictionary
+ new UnknownDictionaryBuilder(encoding)
+ .build(inputDir)
+ .write(outputDir);
+
+ // Build Connection Cost
+ ConnectionCostsBuilder.build(inputDir.resolve("matrix.def"))
+ .write(outputDir);
+ }
+
+ public static void main(String[] args) throws IOException {
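+ // args: <inputDir> <outputDir> <encoding> <normalizeEntries>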
+ String inputDirname = args[0];
+ String outputDirname = args[1];
+ String inputEncoding = args[2];
+ boolean normalizeEntries = Boolean.parseBoolean(args[3]);
+ DictionaryBuilder.build(Paths.get(inputDirname), Paths.get(outputDirname), inputEncoding, normalizeEntries);
+ }
+}
diff --git a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java
similarity index 50%
rename from lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java
rename to lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java
index 27c72dadfe6..e4c288b9b2c 100644
--- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java
@@ -17,20 +17,17 @@
package org.apache.lucene.analysis.ko.util;
import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FilenameFilter;
import java.io.IOException;
-import java.io.InputStreamReader;
import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
-import java.nio.charset.CodingErrorAction;
+import java.nio.file.Files;
+import java.nio.file.Path;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Arrays;
-import java.util.Collections;
import java.util.Comparator;
import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
@@ -38,72 +35,59 @@ import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
-public class TokenInfoDictionaryBuilder {
+class TokenInfoDictionaryBuilder {
/** Internal word id - incrementally assigned as entries are read and added. This will be byte offset of dictionary file */
private int offset = 0;
- private String encoding = "utf-8";
-
+ private String encoding;
private Normalizer.Form normalForm;
- public TokenInfoDictionaryBuilder(String encoding, boolean normalizeEntries) {
+ TokenInfoDictionaryBuilder(String encoding, boolean normalizeEntries) {
this.encoding = encoding;
- this.normalForm = normalizeEntries ? Normalizer.Form.NFKC : null;
+ normalForm = normalizeEntries ? Normalizer.Form.NFKC : null;
}
- public TokenInfoDictionaryWriter build(String dirname) throws IOException {
- FilenameFilter filter = (dir, name) -> name.endsWith(".csv");
- ArrayList<File> csvFiles = new ArrayList<>();
- for (File file : new File(dirname).listFiles(filter)) {
- csvFiles.add(file);
+ public TokenInfoDictionaryWriter build(Path dir) throws IOException {
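+ // sort the CSV files first: the later stable sort by surface form then yields deterministic word ids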
+ try (Stream<Path> files = Files.list(dir)) {
+ List<Path> csvFiles = files
+ .filter(path -> path.getFileName().toString().endsWith(".csv"))
+ .sorted()
+ .collect(Collectors.toList());
+ return buildDictionary(csvFiles);
}
- Collections.sort(csvFiles);
- return buildDictionary(csvFiles);
}
- public TokenInfoDictionaryWriter buildDictionary(List<File> csvFiles) throws IOException {
+ private TokenInfoDictionaryWriter buildDictionary(List<Path> csvFiles) throws IOException {
TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);
-
// all lines in the file
- System.out.println(" parse...");
List<String[]> lines = new ArrayList<>(400000);
- for (File file : csvFiles){
- FileInputStream inputStream = new FileInputStream(file);
- Charset cs = Charset.forName(encoding);
- CharsetDecoder decoder = cs.newDecoder()
- .onMalformedInput(CodingErrorAction.REPORT)
- .onUnmappableCharacter(CodingErrorAction.REPORT);
- InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
- BufferedReader reader = new BufferedReader(streamReader);
-
- String line = null;
- while ((line = reader.readLine()) != null) {
- String[] entry = CSVUtil.parse(line);
+ for (Path path : csvFiles) {
+ try (BufferedReader reader = Files.newBufferedReader(path, Charset.forName(encoding))) {
+ String line;
+ while ((line = reader.readLine()) != null) {
+ String[] entry = CSVUtil.parse(line);
- if(entry.length < 12) {
- throw new IllegalArgumentException("Entry in CSV is not valid (12 field values expected): " + line);
- }
-
- // NFKC normalize dictionary entry
- if (normalForm != null) {
- String[] normalizedEntry = new String[entry.length];
- for (int i = 0; i < entry.length; i++) {
- normalizedEntry[i] = Normalizer.normalize(entry[i], normalForm);
+ if (entry.length < 12) {
+ throw new IllegalArgumentException("Entry in CSV is not valid (12 field values expected): " + line);
+ }
+
+ // NFKC normalize dictionary entry
+ if (normalForm != null) {
+ String[] normalizedEntry = new String[entry.length];
+ for (int i = 0; i < entry.length; i++) {
+ normalizedEntry[i] = Normalizer.normalize(entry[i], normalForm);
+ }
+ lines.add(normalizedEntry);
+ } else {
+ lines.add(entry);
}
- lines.add(normalizedEntry);
- } else {
- lines.add(entry);
}
}
}
- System.out.println(" sort...");
-
// sort by term: we sorted the files already and use a stable sort.
- Collections.sort(lines, Comparator.comparing(left -> left[0]));
-
- System.out.println(" encode...");
+ lines.sort(Comparator.comparing(left -> left[0]));
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
@@ -111,7 +95,7 @@ public class TokenInfoDictionaryBuilder {
long ord = -1; // first ord will be 0
String lastValue = null;
- // build tokeninfo dictionary
+ // build token info dictionary
for (String[] entry : lines) {
String surfaceForm = entry[0].trim();
if (surfaceForm.isEmpty()) {
@@ -119,9 +103,8 @@ public class TokenInfoDictionaryBuilder {
}
int next = dictionary.put(entry);
- if(next == offset){
- System.out.println("Failed to process line: " + Arrays.toString(entry));
- continue;
+ if (next == offset) {
+ throw new IllegalStateException("Failed to process line: " + Arrays.toString(entry));
}
if (!surfaceForm.equals(lastValue)) {
@@ -135,16 +118,10 @@ public class TokenInfoDictionaryBuilder {
}
fstBuilder.add(scratch.get(), ord);
}
- dictionary.addMapping((int)ord, offset);
+ dictionary.addMapping((int) ord, offset);
offset = next;
}
-
- final FST<Long> fst = fstBuilder.finish();
-
- System.out.print(" " + fstBuilder.getNodeCount() + " nodes, " + fstBuilder.getArcCount() + " arcs, " + fst.ramBytesUsed() + " bytes... ");
- dictionary.setFST(fst);
- System.out.println(" done");
-
+ dictionary.setFST(fstBuilder.finish());
return dictionary;
}
}
diff --git a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryWriter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryWriter.java
similarity index 72%
rename from lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryWriter.java
rename to lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryWriter.java
index c1554d2f030..6d3f241c866 100644
--- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryWriter.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryWriter.java
@@ -19,31 +19,31 @@ package org.apache.lucene.analysis.ko.util;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
-import java.nio.file.Paths;
+import java.util.Objects;
import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary;
import org.apache.lucene.util.fst.FST;
-public class TokenInfoDictionaryWriter extends BinaryDictionaryWriter {
+class TokenInfoDictionaryWriter extends BinaryDictionaryWriter {
private FST<Long> fst;
- public TokenInfoDictionaryWriter(int size) {
+ TokenInfoDictionaryWriter(int size) {
super(TokenInfoDictionary.class, size);
}
public void setFST(FST<Long> fst) {
+ Objects.requireNonNull(fst, "dictionary must not be empty");
this.fst = fst;
}
@Override
- public void write(String baseDir) throws IOException {
+ public void write(Path baseDir) throws IOException {
super.write(baseDir);
- writeFST(getBaseFileName(baseDir) + TokenInfoDictionary.FST_FILENAME_SUFFIX);
+ writeFST(baseDir.resolve(getBaseFileName() + TokenInfoDictionary.FST_FILENAME_SUFFIX));
}
- protected void writeFST(String filename) throws IOException {
- Path p = Paths.get(filename);
- Files.createDirectories(p.getParent());
- fst.save(p);
+ private void writeFST(Path path) throws IOException {
+ Files.createDirectories(path.getParent());
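+ // fst.save(path) does its own buffered writing, so no extra stream handling is needed here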
+ fst.save(path);
}
}
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryBuilder.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryBuilder.java
new file mode 100644
index 00000000000..763169a93f0
--- /dev/null
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryBuilder.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ko.util;
+
+import java.io.IOException;
+import java.io.LineNumberReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
+
+class UnknownDictionaryBuilder {
+ private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,1798,3559,3677,SY,*,*,*,*,*,*,*";
+
+ private String encoding;
+
+ UnknownDictionaryBuilder(String encoding) {
+ this.encoding = encoding;
+ }
+
+ public UnknownDictionaryWriter build(Path dir) throws IOException {
+ UnknownDictionaryWriter unkDictionary = readDictionaryFile(dir.resolve("unk.def")); //Should be only one file
+ readCharacterDefinition(dir.resolve("char.def"), unkDictionary);
+ return unkDictionary;
+ }
+
+ private UnknownDictionaryWriter readDictionaryFile(Path path) throws IOException {
+ return readDictionaryFile(path, encoding);
+ }
+
+ private UnknownDictionaryWriter readDictionaryFile(Path path, String encoding) throws IOException {
+ UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);
+
+ List<String[]> lines = new ArrayList<>();
+ try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding));
+ LineNumberReader lineReader = new LineNumberReader(reader)) {
+
+ dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));
+
+ String line;
+ while ((line = lineReader.readLine()) != null) {
+ // note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation,
+ // even though the unknown dictionary returns hardcoded null here.
+ final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
+ lines.add(parsed);
+ }
+ }
+
+ lines.sort(Comparator.comparingInt(entry -> CharacterDefinition.lookupCharacterClass(entry[0])));
+
+ for (String[] entry : lines) {
+ dictionary.put(entry);
+ }
+
+ return dictionary;
+ }
+
+ private void readCharacterDefinition(Path path, UnknownDictionaryWriter dictionary) throws IOException {
+ try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding));
+ LineNumberReader lineReader = new LineNumberReader(reader)) {
+
+ String line;
+ while ((line = lineReader.readLine()) != null) {
+ line = line.replaceAll("^\\s", "");
+ line = line.replaceAll("\\s*#.*", "");
+ line = line.replaceAll("\\s+", " ");
+
+ // Skip empty line or comment line
+ if (line.length() == 0) {
+ continue;
+ }
+
+ if (line.startsWith("0x")) { // Category mapping
+ String[] values = line.split(" ", 2); // Split only first space
+
+ if (!values[0].contains("..")) {
+ int cp = Integer.decode(values[0]);
+ dictionary.putCharacterCategory(cp, values[1]);
+ } else {
+ String[] codePoints = values[0].split("\\.\\.");
+ int cpFrom = Integer.decode(codePoints[0]);
+ int cpTo = Integer.decode(codePoints[1]);
+
+ for (int i = cpFrom; i <= cpTo; i++) {
+ dictionary.putCharacterCategory(i, values[1]);
+ }
+ }
+ } else { // Invoke definition
+ String[] values = line.split(" "); // Consecutive space is merged above
+ String characterClassName = values[0];
+ int invoke = Integer.parseInt(values[1]);
+ int group = Integer.parseInt(values[2]);
+ int length = Integer.parseInt(values[3]);
+ dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
+ }
+ }
+ }
+ }
+}
diff --git a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryWriter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryWriter.java
similarity index 93%
rename from lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryWriter.java
rename to lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryWriter.java
index ff98a8dc414..d4f3fb181a2 100644
--- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryWriter.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryWriter.java
@@ -17,11 +17,12 @@
package org.apache.lucene.analysis.ko.util;
import java.io.IOException;
+import java.nio.file.Path;
import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
-public class UnknownDictionaryWriter extends BinaryDictionaryWriter {
+class UnknownDictionaryWriter extends BinaryDictionaryWriter {
private final CharacterDefinitionWriter characterDefinition = new CharacterDefinitionWriter();
@@ -58,7 +59,7 @@ public class UnknownDictionaryWriter extends BinaryDictionaryWriter {
}
@Override
- public void write(String baseDir) throws IOException {
+ public void write(Path baseDir) throws IOException {
super.write(baseDir);
characterDefinition.write(baseDir);
}
diff --git a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat
index fa0cb321fd0..4bacb9ba5af 100644
Binary files a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat and b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat differ
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java
similarity index 63%
rename from lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java
rename to lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java
index 3457de179d6..976789dc29f 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java
@@ -16,15 +16,74 @@
*/
package org.apache.lucene.analysis.ko.dict;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
import org.apache.lucene.analysis.ko.POS;
+import org.apache.lucene.analysis.ko.util.DictionaryBuilder;
import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.IntsRefFSTEnum;
-import org.apache.lucene.util.fst.IntsRefFSTEnum.InputOutput;
-public class TestTokenInfoDictionary extends LuceneTestCase {
+import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.ResourceScheme;
+
+/**
+ * Tests of the TokenInfoDictionary build tools.
+ */
+public class TokenInfoDictionaryTest extends LuceneTestCase {
+
+ public void testPut() throws Exception {
+ TokenInfoDictionary dict = newDictionary("명사,1,1,2,NNG,*,*,*,*,*,*,*",
+ // "large" id
+ "일반,5000,5000,3,NNG,*,*,*,*,*,*,*");
+ IntsRef wordIdRef = new IntsRefBuilder().get();
+
+ dict.lookupWordIds(0, wordIdRef);
+ int wordId = wordIdRef.ints[wordIdRef.offset];
+ assertEquals(1, dict.getLeftId(wordId));
+ assertEquals(1, dict.getRightId(wordId));
+ assertEquals(2, dict.getWordCost(wordId));
+
+ dict.lookupWordIds(1, wordIdRef);
+ wordId = wordIdRef.ints[wordIdRef.offset];
+ assertEquals(5000, dict.getLeftId(wordId));
+ assertEquals(5000, dict.getRightId(wordId));
+ assertEquals(3, dict.getWordCost(wordId));
+ }
+
+ private TokenInfoDictionary newDictionary(String... entries) throws Exception {
+ Path dir = createTempDir();
+ try (OutputStream out = Files.newOutputStream(dir.resolve("test.csv"));
+ PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8))) {
+ for (String entry : entries) {
+ printer.println(entry);
+ }
+ }
+ Files.createFile(dir.resolve("unk.def"));
+ Files.createFile(dir.resolve("char.def"));
+ try (OutputStream out = Files.newOutputStream(dir.resolve("matrix.def"));
+ PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8))) {
+ printer.println("1 1");
+ }
+ DictionaryBuilder.build(dir, dir, "utf-8", true);
+ String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', '/');
+ // We must also load the other files (in BinaryDictionary) from the correct path
+ return new TokenInfoDictionary(ResourceScheme.FILE, dir.resolve(dictionaryPath).toString());
+ }
+
+ public void testPutException() {
+ // too few columns
+ expectThrows(IllegalArgumentException.class, () -> newDictionary("HANGUL,1,1,1,NNG,*,*,*,*,*"));
+ // id too large
+ expectThrows(IllegalArgumentException.class, () -> newDictionary("HANGUL,8192,8192,1,NNG,*,*,*,*,*,*,*"));
+ }
/** enumerates the entire FST/lookup data and just does basic sanity checks */
public void testEnumerateAll() throws Exception {
@@ -38,12 +97,12 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
ConnectionCosts matrix = ConnectionCosts.getInstance();
FST<Long> fst = tid.getFST().getInternalFST();
IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst);
- InputOutput<Long> mapping;
+ IntsRefFSTEnum.InputOutput<Long> mapping;
IntsRef scratch = new IntsRef();
while ((mapping = fstEnum.next()) != null) {
numTerms++;
IntsRef input = mapping.input;
- char chars[] = new char[input.length];
+ char[] chars = new char[input.length];
for (int i = 0; i < chars.length; i++) {
chars[i] = (char)input.ints[input.offset+i];
}
@@ -51,7 +110,7 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
assertFalse(surfaceForm.isEmpty());
assertEquals(surfaceForm.trim(), surfaceForm);
assertTrue(UnicodeUtil.validUTF16String(surfaceForm));
-
+
Long output = mapping.output;
int sourceId = output.intValue();
// we walk in order, terms, sourceIds, and wordIds should always be increasing
diff --git a/lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/UnknownDictionaryTest.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/util/UnknownDictionaryTest.java
similarity index 93%
rename from lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/UnknownDictionaryTest.java
rename to lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/util/UnknownDictionaryTest.java
index cf5d6b7ee1d..e7f69188615 100644
--- a/lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/UnknownDictionaryTest.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/util/UnknownDictionaryTest.java
@@ -14,11 +14,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.analysis.ko.dict;
+package org.apache.lucene.analysis.ko.util;
-
-import org.apache.lucene.analysis.ko.util.CSVUtil;
-import org.apache.lucene.analysis.ko.util.UnknownDictionaryWriter;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
diff --git a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/ConnectionCostsBuilder.java b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/ConnectionCostsBuilder.java
deleted file mode 100644
index 29659de3819..00000000000
--- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/ConnectionCostsBuilder.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.ko.util;
-
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.LineNumberReader;
-import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
-import java.nio.charset.CodingErrorAction;
-import java.nio.charset.StandardCharsets;
-
-public class ConnectionCostsBuilder {
-
- private ConnectionCostsBuilder() {
- }
-
- public static ConnectionCostsWriter build(String filename) throws IOException {
- FileInputStream inputStream = new FileInputStream(filename);
- Charset cs = StandardCharsets.US_ASCII;
- CharsetDecoder decoder = cs.newDecoder()
- .onMalformedInput(CodingErrorAction.REPORT)
- .onUnmappableCharacter(CodingErrorAction.REPORT);
- InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
- LineNumberReader lineReader = new LineNumberReader(streamReader);
-
- String line = lineReader.readLine();
- String[] dimensions = line.split("\\s+");
-
- assert dimensions.length == 2;
-
- int forwardSize = Integer.parseInt(dimensions[0]);
- int backwardSize = Integer.parseInt(dimensions[1]);
-
- assert forwardSize > 0 && backwardSize > 0;
-
- ConnectionCostsWriter costs = new ConnectionCostsWriter(forwardSize, backwardSize);
-
- while ((line = lineReader.readLine()) != null) {
- String[] fields = line.split("\\s+");
-
- assert fields.length == 3;
-
- int forwardId = Integer.parseInt(fields[0]);
- int backwardId = Integer.parseInt(fields[1]);
- int cost = Integer.parseInt(fields[2]);
-
- costs.add(forwardId, backwardId, cost);
- }
- return costs;
- }
-}
diff --git a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/DictionaryBuilder.java b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/DictionaryBuilder.java
deleted file mode 100644
index e0039a27125..00000000000
--- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/DictionaryBuilder.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.ko.util;
-
-import java.io.File;
-import java.io.IOException;
-
-public class DictionaryBuilder {
-
- private DictionaryBuilder() {
- }
-
- public static void build(String inputDirname, String outputDirname, String encoding, boolean normalizeEntry) throws IOException {
- System.out.println("building tokeninfo dict...");
- TokenInfoDictionaryBuilder tokenInfoBuilder = new TokenInfoDictionaryBuilder(encoding, normalizeEntry);
- TokenInfoDictionaryWriter tokenInfoDictionary = tokenInfoBuilder.build(inputDirname);
- tokenInfoDictionary.write(outputDirname);
- tokenInfoDictionary = null;
- tokenInfoBuilder = null;
- System.out.println("done");
-
- System.out.print("building unknown word dict...");
- UnknownDictionaryBuilder unkBuilder = new UnknownDictionaryBuilder(encoding);
- UnknownDictionaryWriter unkDictionary = unkBuilder.build(inputDirname);
- unkDictionary.write(outputDirname);
- unkDictionary = null;
- unkBuilder = null;
- System.out.println("done");
-
- System.out.print("building connection costs...");
- ConnectionCostsWriter connectionCosts
- = ConnectionCostsBuilder.build(inputDirname + File.separator + "matrix.def");
- connectionCosts.write(outputDirname);
- System.out.println("done");
- }
-
- public static void main(String[] args) throws IOException {
- String inputDirname = args[0];
- String outputDirname = args[1];
- String inputEncoding = args[2];
- boolean normalizeEntries = Boolean.parseBoolean(args[3]);
-
- System.out.println("dictionary builder");
- System.out.println("");
- System.out.println("input directory: " + inputDirname);
- System.out.println("output directory: " + outputDirname);
- System.out.println("input encoding: " + inputEncoding);
- System.out.println("normalize entries: " + normalizeEntries);
- System.out.println("");
- DictionaryBuilder.build(inputDirname, outputDirname, inputEncoding, normalizeEntries);
- }
-
-}
diff --git a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryBuilder.java b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryBuilder.java
deleted file mode 100644
index a4088664ce3..00000000000
--- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryBuilder.java
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.ko.util;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.LineNumberReader;
-import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
-import java.nio.charset.CodingErrorAction;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
-
-import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
-
-public class UnknownDictionaryBuilder {
- private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,1798,3559,3677,SY,*,*,*,*,*,*,*";
-
- private String encoding = "utf-8";
-
- public UnknownDictionaryBuilder(String encoding) {
- this.encoding = encoding;
- }
-
- public UnknownDictionaryWriter build(String dirname) throws IOException {
- UnknownDictionaryWriter unkDictionary = readDictionaryFile(dirname + File.separator + "unk.def"); //Should be only one file
- readCharacterDefinition(dirname + File.separator + "char.def", unkDictionary);
- return unkDictionary;
- }
-
- public UnknownDictionaryWriter readDictionaryFile(String filename)
- throws IOException {
- return readDictionaryFile(filename, encoding);
- }
-
- public UnknownDictionaryWriter readDictionaryFile(String filename, String encoding)
- throws IOException {
- UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);
-
- FileInputStream inputStream = new FileInputStream(filename);
- Charset cs = Charset.forName(encoding);
- CharsetDecoder decoder = cs.newDecoder()
- .onMalformedInput(CodingErrorAction.REPORT)
- .onUnmappableCharacter(CodingErrorAction.REPORT);
- InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
- LineNumberReader lineReader = new LineNumberReader(streamReader);
-
- dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));
-
- List<String[]> lines = new ArrayList<>();
- String line = null;
- while ((line = lineReader.readLine()) != null) {
- // note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation,
- // even though the unknown dictionary returns hardcoded null here.
- final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
- lines.add(parsed);
- }
-
- Collections.sort(lines, new Comparator<String[]>() {
- public int compare(String[] left, String[] right) {
- int leftId = CharacterDefinition.lookupCharacterClass(left[0]);
- int rightId = CharacterDefinition.lookupCharacterClass(right[0]);
- return leftId - rightId;
- }
- });
-
- for (String[] entry : lines) {
- dictionary.put(entry);
- }
-
- return dictionary;
- }
-
- public void readCharacterDefinition(String filename, UnknownDictionaryWriter dictionary) throws IOException {
- FileInputStream inputStream = new FileInputStream(filename);
- InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
- LineNumberReader lineReader = new LineNumberReader(streamReader);
-
- String line = null;
-
- while ((line = lineReader.readLine()) != null) {
- line = line.replaceAll("^\\s", "");
- line = line.replaceAll("\\s*#.*", "");
- line = line.replaceAll("\\s+", " ");
-
- // Skip empty line or comment line
- if(line.length() == 0) {
- continue;
- }
-
- if(line.startsWith("0x")) { // Category mapping
- String[] values = line.split(" ", 2); // Split only first space
-
- if(!values[0].contains("..")) {
- int cp = Integer.decode(values[0]).intValue();
- dictionary.putCharacterCategory(cp, values[1]);
- } else {
- String[] codePoints = values[0].split("\\.\\.");
- int cpFrom = Integer.decode(codePoints[0]).intValue();
- int cpTo = Integer.decode(codePoints[1]).intValue();
-
- for(int i = cpFrom; i <= cpTo; i++){
- dictionary.putCharacterCategory(i, values[1]);
- }
- }
- } else { // Invoke definition
- String[] values = line.split(" "); // Consecutive space is merged above
- String characterClassName = values[0];
- int invoke = Integer.parseInt(values[1]);
- int group = Integer.parseInt(values[2]);
- int length = Integer.parseInt(values[3]);
- dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
- }
- }
- }
-}
diff --git a/lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java b/lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java
deleted file mode 100644
index 492abea664b..00000000000
--- a/lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.ko.dict;
-
-import java.io.OutputStream;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-import java.nio.file.Files;
-import java.nio.file.Path;
-
-import org.apache.lucene.analysis.ko.util.TokenInfoDictionaryBuilder;
-import org.apache.lucene.analysis.ko.util.TokenInfoDictionaryWriter;
-import org.apache.lucene.util.IntsRef;
-import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.LuceneTestCase;
-
-import static java.io.File.separatorChar;
-import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.ResourceScheme;
-
-/**
- * Tests of TokenInfoDictionary build tools; run using ant test-tools
- */
-public class TokenInfoDictionaryTest extends LuceneTestCase {
-
- public void testPut() throws Exception {
- TokenInfoDictionary dict = newDictionary("명사,1,1,2,NNG,*,*,*,*,*,*,*",
- // "large" id
- "일반,5000,5000,3,NNG,*,*,*,*,*,*,*");
- IntsRef wordIdRef = new IntsRefBuilder().get();
-
- dict.lookupWordIds(0, wordIdRef);
- int wordId = wordIdRef.ints[wordIdRef.offset];
- assertEquals(1, dict.getLeftId(wordId));
- assertEquals(1, dict.getRightId(wordId));
- assertEquals(2, dict.getWordCost(wordId));
-
- dict.lookupWordIds(1, wordIdRef);
- wordId = wordIdRef.ints[wordIdRef.offset];
- assertEquals(5000, dict.getLeftId(wordId));
- assertEquals(5000, dict.getRightId(wordId));
- assertEquals(3, dict.getWordCost(wordId));
- }
-
- private TokenInfoDictionary newDictionary(String... entries) throws Exception {
- Path dir = createTempDir();
- try (OutputStream out = Files.newOutputStream(dir.resolve("test.csv"));
- PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, "utf-8"))) {
- for (String entry : entries) {
- printer.println(entry);
- }
- }
- TokenInfoDictionaryBuilder builder = new TokenInfoDictionaryBuilder("utf-8", true);
- TokenInfoDictionaryWriter writer = builder.build(dir.toString());
- writer.write(dir.toString());
- String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', separatorChar);
- // We must also load the other files (in BinaryDictionary) from the correct path
- return new TokenInfoDictionary(ResourceScheme.FILE, dir.resolve(dictionaryPath).toString());
- }
-
- public void testPutException() throws Exception {
- // too few columns
- expectThrows(IllegalArgumentException.class, () -> newDictionary("HANGUL,1,1,1,NNG,*,*,*,*,*"));
- // id too large
- expectThrows(IllegalArgumentException.class, () -> newDictionary("HANGUL,8192,8192,1,NNG,*,*,*,*,*,*,*"));
- }
-}