diff --git a/lucene/analysis/nori/build.xml b/lucene/analysis/nori/build.xml index 6b82816b807..7d5b0b99f41 100644 --- a/lucene/analysis/nori/build.xml +++ b/lucene/analysis/nori/build.xml @@ -26,7 +26,6 @@ - @@ -45,6 +44,9 @@ + + + @@ -57,28 +59,14 @@ - - - - - - - - - - - - - + - - - + @@ -90,34 +78,7 @@ - - - - - - - - - - - - - - - - - - - - - - - + diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java index 7c645a5cc11..02481e12e0e 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java @@ -21,7 +21,6 @@ import java.io.InputStream; import java.io.IOException; import org.apache.lucene.store.InputStreamDataInput; -import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PositiveIntOutputs; @@ -46,20 +45,9 @@ public final class TokenInfoDictionary extends BinaryDictionary { */ TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException { super(resourceScheme, resourcePath); - InputStream is = null; FST fst; - boolean success = false; - try { - is = getResource(FST_FILENAME_SUFFIX); - is = new BufferedInputStream(is); + try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) { fst = new FST<>(new InputStreamDataInput(is), PositiveIntOutputs.getSingleton()); - success = true; - } finally { - if (success) { - IOUtils.close(is); - } else { - IOUtils.closeWhileHandlingException(is); - } } this.fst = new TokenInfoFST(fst); } diff --git a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java similarity index 83% rename from 
lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java rename to lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java index db57d4fd66f..6a19b1b56a8 100644 --- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java @@ -17,13 +17,13 @@ package org.apache.lucene.analysis.ko.util; import java.io.BufferedOutputStream; -import java.io.File; -import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.nio.ByteBuffer; import java.nio.channels.Channels; import java.nio.channels.WritableByteChannel; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -37,17 +37,17 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.analysis.ko.dict.BinaryDictionary; -public abstract class BinaryDictionaryWriter { +abstract class BinaryDictionaryWriter { private final static int ID_LIMIT = 8192; - protected final Class implClazz; + private final Class implClazz; protected ByteBuffer buffer; private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1; private int[] targetMap = new int[8192]; private int[] targetMapOffsets = new int[8192]; private final ArrayList posDict = new ArrayList<>(); - public BinaryDictionaryWriter(Class implClazz, int size) { + BinaryDictionaryWriter(Class implClazz, int size) { this.implClazz = implClazz; buffer = ByteBuffer.allocate(size); } @@ -183,7 +183,7 @@ public abstract class BinaryDictionaryWriter { } } - public void addMapping(int sourceId, int wordId) { + void addMapping(int sourceId, int wordId) { if (wordId <= lastWordId) { throw new IllegalStateException("words out of order: " + wordId + " vs lastID: " + lastWordId); } @@ -205,27 +205,26 @@ public abstract class 
BinaryDictionaryWriter { lastWordId = wordId; } - protected final String getBaseFileName(String baseDir) { - return baseDir + File.separator + implClazz.getName().replace('.', File.separatorChar); + final String getBaseFileName() { + return implClazz.getName().replace('.', '/'); } /** * Write dictionary in file * @throws IOException if an I/O error occurs writing the dictionary files */ - public void write(String baseDir) throws IOException { - final String baseName = getBaseFileName(baseDir); - writeDictionary(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX); - writeTargetMap(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX); - writePosDict(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX); + public void write(Path baseDir) throws IOException { + final String baseName = getBaseFileName(); + writeDictionary(baseDir.resolve(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX)); + writeTargetMap(baseDir.resolve(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX)); + writePosDict(baseDir.resolve(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX)); } - protected void writeTargetMap(String filename) throws IOException { - new File(filename).getParentFile().mkdirs(); - OutputStream os = new FileOutputStream(filename); - try { - os = new BufferedOutputStream(os); - final DataOutput out = new OutputStreamDataOutput(os); + private void writeTargetMap(Path path) throws IOException { + Files.createDirectories(path.getParent()); + try (OutputStream os = Files.newOutputStream(path); + OutputStream bos = new BufferedOutputStream(os)) { + final DataOutput out = new OutputStreamDataOutput(bos); CodecUtil.writeHeader(out, BinaryDictionary.TARGETMAP_HEADER, BinaryDictionary.VERSION); final int numSourceIds = lastSourceId + 1; @@ -246,17 +245,14 @@ public abstract class BinaryDictionaryWriter { if (sourceId != numSourceIds) { throw new IllegalStateException("sourceId:" + sourceId + " != numSourceIds:" + numSourceIds); } - } finally { - os.close(); } } - protected void 
writePosDict(String filename) throws IOException { - new File(filename).getParentFile().mkdirs(); - OutputStream os = new FileOutputStream(filename); - try { - os = new BufferedOutputStream(os); - final DataOutput out = new OutputStreamDataOutput(os); + private void writePosDict(Path path) throws IOException { + Files.createDirectories(path.getParent()); + try (OutputStream os = Files.newOutputStream(path); + OutputStream bos = new BufferedOutputStream(os)) { + final DataOutput out = new OutputStreamDataOutput(bos); CodecUtil.writeHeader(out, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION); out.writeVInt(posDict.size()); for (String s : posDict) { @@ -270,25 +266,21 @@ public abstract class BinaryDictionaryWriter { out.writeByte((byte) POS.Tag.valueOf(data[0]).ordinal()); } } - } finally { - os.close(); } } - protected void writeDictionary(String filename) throws IOException { - new File(filename).getParentFile().mkdirs(); - final FileOutputStream os = new FileOutputStream(filename); - try { - final DataOutput out = new OutputStreamDataOutput(os); + private void writeDictionary(Path path) throws IOException { + Files.createDirectories(path.getParent()); + try (OutputStream os = Files.newOutputStream(path); + OutputStream bos = new BufferedOutputStream(os)) { + final DataOutput out = new OutputStreamDataOutput(bos); CodecUtil.writeHeader(out, BinaryDictionary.DICT_HEADER, BinaryDictionary.VERSION); out.writeVInt(buffer.position()); - final WritableByteChannel channel = Channels.newChannel(os); + final WritableByteChannel channel = Channels.newChannel(bos); // Write Buffer buffer.flip(); // set position to 0, set limit to current position channel.write(buffer); assert buffer.remaining() == 0L; - } finally { - os.close(); } } } diff --git a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/CharacterDefinitionWriter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/CharacterDefinitionWriter.java similarity index 79% 
rename from lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/CharacterDefinitionWriter.java rename to lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/CharacterDefinitionWriter.java index 5a785492789..a45bf479d19 100644 --- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/CharacterDefinitionWriter.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/CharacterDefinitionWriter.java @@ -17,10 +17,10 @@ package org.apache.lucene.analysis.ko.util; import java.io.BufferedOutputStream; -import java.io.File; -import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.Arrays; import org.apache.lucene.analysis.ko.dict.CharacterDefinition; @@ -29,7 +29,7 @@ import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.OutputStreamDataOutput; -public final class CharacterDefinitionWriter { +final class CharacterDefinitionWriter { private final byte[] characterCategoryMap = new byte[0x10000]; @@ -39,7 +39,7 @@ public final class CharacterDefinitionWriter { /** * Constructor for building. 
TODO: remove write access */ - public CharacterDefinitionWriter() { + CharacterDefinitionWriter() { Arrays.fill(characterCategoryMap, CharacterDefinition.DEFAULT); } @@ -50,7 +50,7 @@ public final class CharacterDefinitionWriter { * code point * @param characterClassName character class name */ - public void putCharacterCategory(int codePoint, String characterClassName) { + void putCharacterCategory(int codePoint, String characterClassName) { characterClassName = characterClassName.split(" ")[0]; // use first // category // class @@ -62,20 +62,17 @@ public final class CharacterDefinitionWriter { characterCategoryMap[codePoint] = CharacterDefinition.lookupCharacterClass(characterClassName); } - public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) { + void putInvokeDefinition(String characterClassName, int invoke, int group, int length) { final byte characterClass = CharacterDefinition.lookupCharacterClass(characterClassName); invokeMap[characterClass] = invoke == 1; groupMap[characterClass] = group == 1; // TODO: length def ignored } - public void write(String baseDir) throws IOException { - String filename = baseDir + File.separator + - CharacterDefinition.class.getName().replace('.', File.separatorChar) + CharacterDefinition.FILENAME_SUFFIX; - new File(filename).getParentFile().mkdirs(); - OutputStream os = new FileOutputStream(filename); - try { - os = new BufferedOutputStream(os); + public void write(Path baseDir) throws IOException { + Path path = baseDir.resolve(CharacterDefinition.class.getName().replace('.', '/') + CharacterDefinition.FILENAME_SUFFIX); + Files.createDirectories(path.getParent()); + try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(path))){ final DataOutput out = new OutputStreamDataOutput(os); CodecUtil.writeHeader(out, CharacterDefinition.HEADER, CharacterDefinition.VERSION); out.writeBytes(characterCategoryMap, 0, characterCategoryMap.length); @@ -86,8 +83,6 @@ public final class 
CharacterDefinitionWriter { ); out.writeByte(b); } - } finally { - os.close(); } } diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/ConnectionCostsBuilder.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/ConnectionCostsBuilder.java new file mode 100644 index 00000000000..34002d2ff6d --- /dev/null +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/ConnectionCostsBuilder.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.analysis.ko.util; + +import java.io.IOException; +import java.io.LineNumberReader; +import java.io.Reader; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +class ConnectionCostsBuilder { + + private ConnectionCostsBuilder() { + } + + public static ConnectionCostsWriter build(Path path) throws IOException { + try (Reader reader = Files.newBufferedReader(path, StandardCharsets.US_ASCII); + LineNumberReader lineReader = new LineNumberReader(reader)) { + + String line = lineReader.readLine(); + String[] dimensions = line.split("\\s+"); + + assert dimensions.length == 2; + + int forwardSize = Integer.parseInt(dimensions[0]); + int backwardSize = Integer.parseInt(dimensions[1]); + + assert forwardSize > 0 && backwardSize > 0; + + ConnectionCostsWriter costs = new ConnectionCostsWriter(forwardSize, backwardSize); + + while ((line = lineReader.readLine()) != null) { + String[] fields = line.split("\\s+"); + + assert fields.length == 3; + + int forwardId = Integer.parseInt(fields[0]); + int backwardId = Integer.parseInt(fields[1]); + int cost = Integer.parseInt(fields[2]); + + costs.add(forwardId, backwardId, cost); + } + return costs; + } + } +} diff --git a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/ConnectionCostsWriter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/ConnectionCostsWriter.java similarity index 73% rename from lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/ConnectionCostsWriter.java rename to lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/ConnectionCostsWriter.java index f16f8273917..586290d687f 100644 --- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/ConnectionCostsWriter.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/ConnectionCostsWriter.java @@ -17,10 +17,10 @@ package org.apache.lucene.analysis.ko.util; import 
java.io.BufferedOutputStream; -import java.io.File; -import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; import org.apache.lucene.analysis.ko.dict.ConnectionCosts; @@ -28,7 +28,7 @@ import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.OutputStreamDataOutput; -public final class ConnectionCostsWriter { +final class ConnectionCostsWriter { private final short[][] costs; // array is backward IDs first since get is called using the same backward ID consecutively. maybe doesn't matter. private final int forwardSize; @@ -36,7 +36,7 @@ public final class ConnectionCostsWriter { /** * Constructor for building. TODO: remove write access */ - public ConnectionCostsWriter(int forwardSize, int backwardSize) { + ConnectionCostsWriter(int forwardSize, int backwardSize) { this.forwardSize = forwardSize; this.backwardSize = backwardSize; this.costs = new short[backwardSize][forwardSize]; @@ -46,14 +46,12 @@ public final class ConnectionCostsWriter { this.costs[backwardId][forwardId] = (short)cost; } - public void write(String baseDir) throws IOException { - String filename = baseDir + File.separator + - ConnectionCosts.class.getName().replace('.', File.separatorChar) + ConnectionCosts.FILENAME_SUFFIX; - new File(filename).getParentFile().mkdirs(); - OutputStream os = new FileOutputStream(filename); - try { - os = new BufferedOutputStream(os); - final DataOutput out = new OutputStreamDataOutput(os); + public void write(Path baseDir) throws IOException { + Files.createDirectories(baseDir); + String fileName = ConnectionCosts.class.getName().replace('.', '/') + ConnectionCosts.FILENAME_SUFFIX; + try (OutputStream os = Files.newOutputStream(baseDir.resolve(fileName)); + OutputStream bos = new BufferedOutputStream(os)) { + final DataOutput out = new OutputStreamDataOutput(bos); CodecUtil.writeHeader(out, ConnectionCosts.HEADER, 
ConnectionCosts.VERSION); out.writeVInt(forwardSize); out.writeVInt(backwardSize); @@ -61,14 +59,12 @@ public final class ConnectionCostsWriter { assert costs.length == backwardSize; for (short[] a : costs) { assert a.length == forwardSize; - for (int i = 0; i < a.length; i++) { - int delta = (int)a[i] - last; + for (short cost : a) { + int delta = (int) cost - last; out.writeZInt(delta); - last = a[i]; + last = cost; } } - } finally { - os.close(); } } diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/DictionaryBuilder.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/DictionaryBuilder.java new file mode 100644 index 00000000000..889f74406d8 --- /dev/null +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/DictionaryBuilder.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.ko.util; + +import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; + +/** + * Tool to build dictionaries. 
+ */ +public class DictionaryBuilder { + + private DictionaryBuilder() { + } + + public static void build(Path inputDir, Path outputDir, String encoding, boolean normalizeEntry) throws IOException { + // Build TokenInfo Dictionary + new TokenInfoDictionaryBuilder(encoding, normalizeEntry) + .build(inputDir) + .write(outputDir); + + // Build Unknown Word Dictionary + new UnknownDictionaryBuilder(encoding) + .build(inputDir) + .write(outputDir); + + // Build Connection Cost + ConnectionCostsBuilder.build(inputDir.resolve("matrix.def")) + .write(outputDir); + } + + public static void main(String[] args) throws IOException { + String inputDirname = args[0]; + String outputDirname = args[1]; + String inputEncoding = args[2]; + boolean normalizeEntries = Boolean.parseBoolean(args[3]); + DictionaryBuilder.build(Paths.get(inputDirname), Paths.get(outputDirname), inputEncoding, normalizeEntries); + } +} diff --git a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java similarity index 50% rename from lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java rename to lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java index 27c72dadfe6..e4c288b9b2c 100644 --- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java @@ -17,20 +17,17 @@ package org.apache.lucene.analysis.ko.util; import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.FilenameFilter; import java.io.IOException; -import java.io.InputStreamReader; import java.nio.charset.Charset; -import java.nio.charset.CharsetDecoder; -import java.nio.charset.CodingErrorAction; +import 
java.nio.file.Files; +import java.nio.file.Path; import java.text.Normalizer; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.Comparator; import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.Builder; @@ -38,72 +35,59 @@ import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PositiveIntOutputs; -public class TokenInfoDictionaryBuilder { +class TokenInfoDictionaryBuilder { /** Internal word id - incrementally assigned as entries are read and added. This will be byte offset of dictionary file */ private int offset = 0; - private String encoding = "utf-8"; - + private String encoding; private Normalizer.Form normalForm; - public TokenInfoDictionaryBuilder(String encoding, boolean normalizeEntries) { + TokenInfoDictionaryBuilder(String encoding, boolean normalizeEntries) { this.encoding = encoding; - this.normalForm = normalizeEntries ? Normalizer.Form.NFKC : null; + normalForm = normalizeEntries ? 
Normalizer.Form.NFKC : null; } - public TokenInfoDictionaryWriter build(String dirname) throws IOException { - FilenameFilter filter = (dir, name) -> name.endsWith(".csv"); - ArrayList csvFiles = new ArrayList<>(); - for (File file : new File(dirname).listFiles(filter)) { - csvFiles.add(file); + public TokenInfoDictionaryWriter build(Path dir) throws IOException { + try (Stream files = Files.list(dir)) { + List csvFiles = files + .filter(path -> path.getFileName().toString().endsWith(".csv")) + .sorted() + .collect(Collectors.toList()); + return buildDictionary(csvFiles); } - Collections.sort(csvFiles); - return buildDictionary(csvFiles); } - public TokenInfoDictionaryWriter buildDictionary(List csvFiles) throws IOException { + private TokenInfoDictionaryWriter buildDictionary(List csvFiles) throws IOException { TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024); - // all lines in the file - System.out.println(" parse..."); List lines = new ArrayList<>(400000); - for (File file : csvFiles){ - FileInputStream inputStream = new FileInputStream(file); - Charset cs = Charset.forName(encoding); - CharsetDecoder decoder = cs.newDecoder() - .onMalformedInput(CodingErrorAction.REPORT) - .onUnmappableCharacter(CodingErrorAction.REPORT); - InputStreamReader streamReader = new InputStreamReader(inputStream, decoder); - BufferedReader reader = new BufferedReader(streamReader); - - String line = null; - while ((line = reader.readLine()) != null) { - String[] entry = CSVUtil.parse(line); + for (Path path : csvFiles) { + try (BufferedReader reader = Files.newBufferedReader(path, Charset.forName(encoding))) { + String line; + while ((line = reader.readLine()) != null) { + String[] entry = CSVUtil.parse(line); - if(entry.length < 12) { - throw new IllegalArgumentException("Entry in CSV is not valid (12 field values expected): " + line); - } - - // NFKC normalize dictionary entry - if (normalForm != null) { - String[] normalizedEntry = new 
String[entry.length]; - for (int i = 0; i < entry.length; i++) { - normalizedEntry[i] = Normalizer.normalize(entry[i], normalForm); + if (entry.length < 12) { + throw new IllegalArgumentException("Entry in CSV is not valid (12 field values expected): " + line); + } + + // NFKC normalize dictionary entry + if (normalForm != null) { + String[] normalizedEntry = new String[entry.length]; + for (int i = 0; i < entry.length; i++) { + normalizedEntry[i] = Normalizer.normalize(entry[i], normalForm); + } + lines.add(normalizedEntry); + } else { + lines.add(entry); } - lines.add(normalizedEntry); - } else { - lines.add(entry); } } } - System.out.println(" sort..."); - // sort by term: we sorted the files already and use a stable sort. - Collections.sort(lines, Comparator.comparing(left -> left[0])); - - System.out.println(" encode..."); + lines.sort(Comparator.comparing(left -> left[0])); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); Builder fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15); @@ -111,7 +95,7 @@ public class TokenInfoDictionaryBuilder { long ord = -1; // first ord will be 0 String lastValue = null; - // build tokeninfo dictionary + // build token info dictionary for (String[] entry : lines) { String surfaceForm = entry[0].trim(); if (surfaceForm.isEmpty()) { @@ -119,9 +103,8 @@ public class TokenInfoDictionaryBuilder { } int next = dictionary.put(entry); - if(next == offset){ - System.out.println("Failed to process line: " + Arrays.toString(entry)); - continue; + if(next == offset) { + throw new IllegalStateException("Failed to process line: " + Arrays.toString(entry)); } if (!surfaceForm.equals(lastValue)) { @@ -135,16 +118,10 @@ public class TokenInfoDictionaryBuilder { } fstBuilder.add(scratch.get(), ord); } - dictionary.addMapping((int)ord, offset); + dictionary.addMapping((int) ord, offset); offset = next; } - - final FST fst = fstBuilder.finish(); - - System.out.print(" " + 
fstBuilder.getNodeCount() + " nodes, " + fstBuilder.getArcCount() + " arcs, " + fst.ramBytesUsed() + " bytes... "); - dictionary.setFST(fst); - System.out.println(" done"); - + dictionary.setFST(fstBuilder.finish()); return dictionary; } } diff --git a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryWriter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryWriter.java similarity index 72% rename from lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryWriter.java rename to lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryWriter.java index c1554d2f030..6d3f241c866 100644 --- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryWriter.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryWriter.java @@ -19,31 +19,31 @@ package org.apache.lucene.analysis.ko.util; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.Paths; +import java.util.Objects; import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary; import org.apache.lucene.util.fst.FST; -public class TokenInfoDictionaryWriter extends BinaryDictionaryWriter { +class TokenInfoDictionaryWriter extends BinaryDictionaryWriter { private FST fst; - public TokenInfoDictionaryWriter(int size) { + TokenInfoDictionaryWriter(int size) { super(TokenInfoDictionary.class, size); } public void setFST(FST fst) { + Objects.requireNonNull(fst, "dictionary must not be empty"); this.fst = fst; } @Override - public void write(String baseDir) throws IOException { + public void write(Path baseDir) throws IOException { super.write(baseDir); - writeFST(getBaseFileName(baseDir) + TokenInfoDictionary.FST_FILENAME_SUFFIX); + writeFST(baseDir.resolve(getBaseFileName() + TokenInfoDictionary.FST_FILENAME_SUFFIX)); } - protected void writeFST(String 
filename) throws IOException { - Path p = Paths.get(filename); - Files.createDirectories(p.getParent()); - fst.save(p); + private void writeFST(Path path) throws IOException { + Files.createDirectories(path.getParent()); + fst.save(path); } } diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryBuilder.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryBuilder.java new file mode 100644 index 00000000000..763169a93f0 --- /dev/null +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryBuilder.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.analysis.ko.util; + +import java.io.IOException; +import java.io.LineNumberReader; +import java.io.Reader; +import java.nio.charset.Charset; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; + +import org.apache.lucene.analysis.ko.dict.CharacterDefinition; + +class UnknownDictionaryBuilder { + private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,1798,3559,3677,SY,*,*,*,*,*,*,*"; + + private String encoding; + + UnknownDictionaryBuilder(String encoding) { + this.encoding = encoding; + } + + public UnknownDictionaryWriter build(Path dir) throws IOException { + UnknownDictionaryWriter unkDictionary = readDictionaryFile(dir.resolve("unk.def")); //Should be only one file + readCharacterDefinition(dir.resolve("char.def"), unkDictionary); + return unkDictionary; + } + + private UnknownDictionaryWriter readDictionaryFile(Path path) throws IOException { + return readDictionaryFile(path, encoding); + } + + private UnknownDictionaryWriter readDictionaryFile(Path path, String encoding) throws IOException { + UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024); + + List lines = new ArrayList<>(); + try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding)); + LineNumberReader lineReader = new LineNumberReader(reader)) { + + dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY)); + + String line; + while ((line = lineReader.readLine()) != null) { + // note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation, + // even though the unknown dictionary returns hardcoded null here. 
+ final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry + lines.add(parsed); + } + } + + lines.sort(Comparator.comparingInt(entry -> CharacterDefinition.lookupCharacterClass(entry[0]))); + + for (String[] entry : lines) { + dictionary.put(entry); + } + + return dictionary; + } + + private void readCharacterDefinition(Path path, UnknownDictionaryWriter dictionary) throws IOException { + try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding)); + LineNumberReader lineReader = new LineNumberReader(reader)) { + + String line; + while ((line = lineReader.readLine()) != null) { + line = line.replaceAll("^\\s", ""); + line = line.replaceAll("\\s*#.*", ""); + line = line.replaceAll("\\s+", " "); + + // Skip empty line or comment line + if (line.length() == 0) { + continue; + } + + if (line.startsWith("0x")) { // Category mapping + String[] values = line.split(" ", 2); // Split only first space + + if (!values[0].contains("..")) { + int cp = Integer.decode(values[0]); + dictionary.putCharacterCategory(cp, values[1]); + } else { + String[] codePoints = values[0].split("\\.\\."); + int cpFrom = Integer.decode(codePoints[0]); + int cpTo = Integer.decode(codePoints[1]); + + for (int i = cpFrom; i <= cpTo; i++) { + dictionary.putCharacterCategory(i, values[1]); + } + } + } else { // Invoke definition + String[] values = line.split(" "); // Consecutive space is merged above + String characterClassName = values[0]; + int invoke = Integer.parseInt(values[1]); + int group = Integer.parseInt(values[2]); + int length = Integer.parseInt(values[3]); + dictionary.putInvokeDefinition(characterClassName, invoke, group, length); + } + } + } + } +} diff --git a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryWriter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryWriter.java similarity index 93% rename from 
lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryWriter.java rename to lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryWriter.java index ff98a8dc414..d4f3fb181a2 100644 --- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryWriter.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryWriter.java @@ -17,11 +17,12 @@ package org.apache.lucene.analysis.ko.util; import java.io.IOException; +import java.nio.file.Path; import org.apache.lucene.analysis.ko.dict.CharacterDefinition; import org.apache.lucene.analysis.ko.dict.UnknownDictionary; -public class UnknownDictionaryWriter extends BinaryDictionaryWriter { +class UnknownDictionaryWriter extends BinaryDictionaryWriter { private final CharacterDefinitionWriter characterDefinition = new CharacterDefinitionWriter(); @@ -58,7 +59,7 @@ public class UnknownDictionaryWriter extends BinaryDictionaryWriter { } @Override - public void write(String baseDir) throws IOException { + public void write(Path baseDir) throws IOException { super.write(baseDir); characterDefinition.write(baseDir); } diff --git a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat index fa0cb321fd0..4bacb9ba5af 100644 Binary files a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat and b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat differ diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java similarity index 63% rename from lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java 
rename to lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java index 3457de179d6..976789dc29f 100644 --- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java +++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java @@ -16,15 +16,74 @@ */ package org.apache.lucene.analysis.ko.dict; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.io.PrintWriter; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + import org.apache.lucene.analysis.ko.POS; +import org.apache.lucene.analysis.ko.util.DictionaryBuilder; import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.IntsRefFSTEnum; -import org.apache.lucene.util.fst.IntsRefFSTEnum.InputOutput; -public class TestTokenInfoDictionary extends LuceneTestCase { +import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.ResourceScheme; + +/** + * Tests of TokenInfoDictionary build tools; run using ant test-tools + */ +public class TokenInfoDictionaryTest extends LuceneTestCase { + + public void testPut() throws Exception { + TokenInfoDictionary dict = newDictionary("명사,1,1,2,NNG,*,*,*,*,*,*,*", + // "large" id + "일반,5000,5000,3,NNG,*,*,*,*,*,*,*"); + IntsRef wordIdRef = new IntsRefBuilder().get(); + + dict.lookupWordIds(0, wordIdRef); + int wordId = wordIdRef.ints[wordIdRef.offset]; + assertEquals(1, dict.getLeftId(wordId)); + assertEquals(1, dict.getRightId(wordId)); + assertEquals(2, dict.getWordCost(wordId)); + + dict.lookupWordIds(1, wordIdRef); + wordId = wordIdRef.ints[wordIdRef.offset]; + assertEquals(5000, dict.getLeftId(wordId)); + assertEquals(5000, dict.getRightId(wordId)); + assertEquals(3, dict.getWordCost(wordId)); + } + + 
private TokenInfoDictionary newDictionary(String... entries) throws Exception { + Path dir = createTempDir(); + try (OutputStream out = Files.newOutputStream(dir.resolve("test.csv")); + PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8))) { + for (String entry : entries) { + printer.println(entry); + } + } + Files.createFile(dir.resolve("unk.def")); + Files.createFile(dir.resolve("char.def")); + try (OutputStream out = Files.newOutputStream(dir.resolve("matrix.def")); + PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8))) { + printer.println("1 1"); + } + DictionaryBuilder.build(dir, dir, "utf-8", true); + String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', '/'); + // We must also load the other files (in BinaryDictionary) from the correct path + return new TokenInfoDictionary(ResourceScheme.FILE, dir.resolve(dictionaryPath).toString()); + } + + public void testPutException() { + //too few columns + expectThrows(IllegalArgumentException.class, () -> newDictionary("HANGUL,1,1,1,NNG,*,*,*,*,*")); + // id too large + expectThrows(IllegalArgumentException.class, () -> newDictionary("HANGUL,8192,8192,1,NNG,*,*,*,*,*,*,*")); + } /** enumerates the entire FST/lookup data and just does basic sanity checks */ public void testEnumerateAll() throws Exception { @@ -38,12 +97,12 @@ public class TestTokenInfoDictionary extends LuceneTestCase { ConnectionCosts matrix = ConnectionCosts.getInstance(); FST<Long> fst = tid.getFST().getInternalFST(); IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst); - InputOutput<Long> mapping; + IntsRefFSTEnum.InputOutput<Long> mapping; IntsRef scratch = new IntsRef(); while ((mapping = fstEnum.next()) != null) { numTerms++; IntsRef input = mapping.input; - char chars[] = new char[input.length]; + char[] chars = new char[input.length]; for (int i = 0; i < chars.length; i++) { chars[i] = (char)input.ints[input.offset+i]; } @@ -51,7 +110,7 @@ public class 
TestTokenInfoDictionary extends LuceneTestCase { assertFalse(surfaceForm.isEmpty()); assertEquals(surfaceForm.trim(), surfaceForm); assertTrue(UnicodeUtil.validUTF16String(surfaceForm)); - + Long output = mapping.output; int sourceId = output.intValue(); // we walk in order, terms, sourceIds, and wordIds should always be increasing diff --git a/lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/UnknownDictionaryTest.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/util/UnknownDictionaryTest.java similarity index 93% rename from lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/UnknownDictionaryTest.java rename to lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/util/UnknownDictionaryTest.java index cf5d6b7ee1d..e7f69188615 100644 --- a/lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/UnknownDictionaryTest.java +++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/util/UnknownDictionaryTest.java @@ -14,11 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.analysis.ko.dict; +package org.apache.lucene.analysis.ko.util; - -import org.apache.lucene.analysis.ko.util.CSVUtil; -import org.apache.lucene.analysis.ko.util.UnknownDictionaryWriter; import org.apache.lucene.util.LuceneTestCase; import org.junit.Test; diff --git a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/ConnectionCostsBuilder.java b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/ConnectionCostsBuilder.java deleted file mode 100644 index 29659de3819..00000000000 --- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/ConnectionCostsBuilder.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.analysis.ko.util; - -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.LineNumberReader; -import java.nio.charset.Charset; -import java.nio.charset.CharsetDecoder; -import java.nio.charset.CodingErrorAction; -import java.nio.charset.StandardCharsets; - -public class ConnectionCostsBuilder { - - private ConnectionCostsBuilder() { - } - - public static ConnectionCostsWriter build(String filename) throws IOException { - FileInputStream inputStream = new FileInputStream(filename); - Charset cs = StandardCharsets.US_ASCII; - CharsetDecoder decoder = cs.newDecoder() - .onMalformedInput(CodingErrorAction.REPORT) - .onUnmappableCharacter(CodingErrorAction.REPORT); - InputStreamReader streamReader = new InputStreamReader(inputStream, decoder); - LineNumberReader lineReader = new LineNumberReader(streamReader); - - String line = lineReader.readLine(); - String[] dimensions = line.split("\\s+"); - - assert dimensions.length == 2; - - int forwardSize = Integer.parseInt(dimensions[0]); - int backwardSize = Integer.parseInt(dimensions[1]); - - assert forwardSize > 0 && backwardSize > 0; - - ConnectionCostsWriter costs = new ConnectionCostsWriter(forwardSize, backwardSize); - - while ((line = 
lineReader.readLine()) != null) { - String[] fields = line.split("\\s+"); - - assert fields.length == 3; - - int forwardId = Integer.parseInt(fields[0]); - int backwardId = Integer.parseInt(fields[1]); - int cost = Integer.parseInt(fields[2]); - - costs.add(forwardId, backwardId, cost); - } - return costs; - } -} diff --git a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/DictionaryBuilder.java b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/DictionaryBuilder.java deleted file mode 100644 index e0039a27125..00000000000 --- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/DictionaryBuilder.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.analysis.ko.util; - -import java.io.File; -import java.io.IOException; - -public class DictionaryBuilder { - - private DictionaryBuilder() { - } - - public static void build(String inputDirname, String outputDirname, String encoding, boolean normalizeEntry) throws IOException { - System.out.println("building tokeninfo dict..."); - TokenInfoDictionaryBuilder tokenInfoBuilder = new TokenInfoDictionaryBuilder(encoding, normalizeEntry); - TokenInfoDictionaryWriter tokenInfoDictionary = tokenInfoBuilder.build(inputDirname); - tokenInfoDictionary.write(outputDirname); - tokenInfoDictionary = null; - tokenInfoBuilder = null; - System.out.println("done"); - - System.out.print("building unknown word dict..."); - UnknownDictionaryBuilder unkBuilder = new UnknownDictionaryBuilder(encoding); - UnknownDictionaryWriter unkDictionary = unkBuilder.build(inputDirname); - unkDictionary.write(outputDirname); - unkDictionary = null; - unkBuilder = null; - System.out.println("done"); - - System.out.print("building connection costs..."); - ConnectionCostsWriter connectionCosts - = ConnectionCostsBuilder.build(inputDirname + File.separator + "matrix.def"); - connectionCosts.write(outputDirname); - System.out.println("done"); - } - - public static void main(String[] args) throws IOException { - String inputDirname = args[0]; - String outputDirname = args[1]; - String inputEncoding = args[2]; - boolean normalizeEntries = Boolean.parseBoolean(args[3]); - - System.out.println("dictionary builder"); - System.out.println(""); - System.out.println("input directory: " + inputDirname); - System.out.println("output directory: " + outputDirname); - System.out.println("input encoding: " + inputEncoding); - System.out.println("normalize entries: " + normalizeEntries); - System.out.println(""); - DictionaryBuilder.build(inputDirname, outputDirname, inputEncoding, normalizeEntries); - } - -} diff --git 
a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryBuilder.java b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryBuilder.java deleted file mode 100644 index a4088664ce3..00000000000 --- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/UnknownDictionaryBuilder.java +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.analysis.ko.util; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.LineNumberReader; -import java.nio.charset.Charset; -import java.nio.charset.CharsetDecoder; -import java.nio.charset.CodingErrorAction; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; - -import org.apache.lucene.analysis.ko.dict.CharacterDefinition; - -public class UnknownDictionaryBuilder { - private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,1798,3559,3677,SY,*,*,*,*,*,*,*"; - - private String encoding = "utf-8"; - - public UnknownDictionaryBuilder(String encoding) { - this.encoding = encoding; - } - - public UnknownDictionaryWriter build(String dirname) throws IOException { - UnknownDictionaryWriter unkDictionary = readDictionaryFile(dirname + File.separator + "unk.def"); //Should be only one file - readCharacterDefinition(dirname + File.separator + "char.def", unkDictionary); - return unkDictionary; - } - - public UnknownDictionaryWriter readDictionaryFile(String filename) - throws IOException { - return readDictionaryFile(filename, encoding); - } - - public UnknownDictionaryWriter readDictionaryFile(String filename, String encoding) - throws IOException { - UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024); - - FileInputStream inputStream = new FileInputStream(filename); - Charset cs = Charset.forName(encoding); - CharsetDecoder decoder = cs.newDecoder() - .onMalformedInput(CodingErrorAction.REPORT) - .onUnmappableCharacter(CodingErrorAction.REPORT); - InputStreamReader streamReader = new InputStreamReader(inputStream, decoder); - LineNumberReader lineReader = new LineNumberReader(streamReader); - - dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY)); - - List<String[]> lines = new ArrayList<>(); - String line = null; - while ((line = lineReader.readLine()) != null) { - // note: 
unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation, - // even though the unknown dictionary returns hardcoded null here. - final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry - lines.add(parsed); - } - - Collections.sort(lines, new Comparator<String[]>() { - public int compare(String[] left, String[] right) { - int leftId = CharacterDefinition.lookupCharacterClass(left[0]); - int rightId = CharacterDefinition.lookupCharacterClass(right[0]); - return leftId - rightId; - } - }); - - for (String[] entry : lines) { - dictionary.put(entry); - } - - return dictionary; - } - - public void readCharacterDefinition(String filename, UnknownDictionaryWriter dictionary) throws IOException { - FileInputStream inputStream = new FileInputStream(filename); - InputStreamReader streamReader = new InputStreamReader(inputStream, encoding); - LineNumberReader lineReader = new LineNumberReader(streamReader); - - String line = null; - - while ((line = lineReader.readLine()) != null) { - line = line.replaceAll("^\\s", ""); - line = line.replaceAll("\\s*#.*", ""); - line = line.replaceAll("\\s+", " "); - - // Skip empty line or comment line - if(line.length() == 0) { - continue; - } - - if(line.startsWith("0x")) { // Category mapping - String[] values = line.split(" ", 2); // Split only first space - - if(!values[0].contains("..")) { - int cp = Integer.decode(values[0]).intValue(); - dictionary.putCharacterCategory(cp, values[1]); - } else { - String[] codePoints = values[0].split("\\.\\."); - int cpFrom = Integer.decode(codePoints[0]).intValue(); - int cpTo = Integer.decode(codePoints[1]).intValue(); - - for(int i = cpFrom; i <= cpTo; i++){ - dictionary.putCharacterCategory(i, values[1]); - } - } - } else { // Invoke definition - String[] values = line.split(" "); // Consecutive space is merged above - String characterClassName = values[0]; - int invoke = Integer.parseInt(values[1]); - int group = 
Integer.parseInt(values[2]); - int length = Integer.parseInt(values[3]); - dictionary.putInvokeDefinition(characterClassName, invoke, group, length); - } - } - } -} diff --git a/lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java b/lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java deleted file mode 100644 index 492abea664b..00000000000 --- a/lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.analysis.ko.dict; - -import java.io.OutputStream; -import java.io.OutputStreamWriter; -import java.io.PrintWriter; -import java.nio.file.Files; -import java.nio.file.Path; - -import org.apache.lucene.analysis.ko.util.TokenInfoDictionaryBuilder; -import org.apache.lucene.analysis.ko.util.TokenInfoDictionaryWriter; -import org.apache.lucene.util.IntsRef; -import org.apache.lucene.util.IntsRefBuilder; -import org.apache.lucene.util.LuceneTestCase; - -import static java.io.File.separatorChar; -import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.ResourceScheme; - -/** - * Tests of TokenInfoDictionary build tools; run using ant test-tools - */ -public class TokenInfoDictionaryTest extends LuceneTestCase { - - public void testPut() throws Exception { - TokenInfoDictionary dict = newDictionary("명사,1,1,2,NNG,*,*,*,*,*,*,*", - // "large" id - "일반,5000,5000,3,NNG,*,*,*,*,*,*,*"); - IntsRef wordIdRef = new IntsRefBuilder().get(); - - dict.lookupWordIds(0, wordIdRef); - int wordId = wordIdRef.ints[wordIdRef.offset]; - assertEquals(1, dict.getLeftId(wordId)); - assertEquals(1, dict.getRightId(wordId)); - assertEquals(2, dict.getWordCost(wordId)); - - dict.lookupWordIds(1, wordIdRef); - wordId = wordIdRef.ints[wordIdRef.offset]; - assertEquals(5000, dict.getLeftId(wordId)); - assertEquals(5000, dict.getRightId(wordId)); - assertEquals(3, dict.getWordCost(wordId)); - } - - private TokenInfoDictionary newDictionary(String... 
entries) throws Exception { - Path dir = createTempDir(); - try (OutputStream out = Files.newOutputStream(dir.resolve("test.csv")); - PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, "utf-8"))) { - for (String entry : entries) { - printer.println(entry); - } - } - TokenInfoDictionaryBuilder builder = new TokenInfoDictionaryBuilder("utf-8", true); - TokenInfoDictionaryWriter writer = builder.build(dir.toString()); - writer.write(dir.toString()); - String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', separatorChar); - // We must also load the other files (in BinaryDictionary) from the correct path - return new TokenInfoDictionary(ResourceScheme.FILE, dir.resolve(dictionaryPath).toString()); - } - - public void testPutException() throws Exception { - // too few columns - expectThrows(IllegalArgumentException.class, () -> newDictionary("HANGUL,1,1,1,NNG,*,*,*,*,*")); - // id too large - expectThrows(IllegalArgumentException.class, () -> newDictionary("HANGUL,8192,8192,1,NNG,*,*,*,*,*,*,*")); - } -}