mirror of https://github.com/apache/lucene.git
LUCENE-8934: promote nori tools to main jar
This commit is contained in:
parent
254a17b3b0
commit
2c0d8996cf
|
@ -26,7 +26,6 @@
|
||||||
<!-- currently whether rat detects this as binary or not
|
<!-- currently whether rat detects this as binary or not
|
||||||
is platform dependent?! -->
|
is platform dependent?! -->
|
||||||
<property name="rat.excludes" value="**/*.txt,**/bocchan.utf-8"/>
|
<property name="rat.excludes" value="**/*.txt,**/bocchan.utf-8"/>
|
||||||
<property name="rat.additional-includes" value="src/tools/**"/>
|
|
||||||
|
|
||||||
<!-- we don't want to pull in ipadic/naist etc -->
|
<!-- we don't want to pull in ipadic/naist etc -->
|
||||||
<property name="ivy.default.configuration" value="default"/>
|
<property name="ivy.default.configuration" value="default"/>
|
||||||
|
@ -45,6 +44,9 @@
|
||||||
<available type="dir" file="${build.dir}/${dict.version}" property="mecab-ko.dict.available"/>
|
<available type="dir" file="${build.dir}/${dict.version}" property="mecab-ko.dict.available"/>
|
||||||
|
|
||||||
<path id="classpath">
|
<path id="classpath">
|
||||||
|
<dirset dir="${build.dir}">
|
||||||
|
<include name="classes/java"/>
|
||||||
|
</dirset>
|
||||||
<pathelement path="${analyzers-common.jar}"/>
|
<pathelement path="${analyzers-common.jar}"/>
|
||||||
<path refid="base.classpath"/>
|
<path refid="base.classpath"/>
|
||||||
</path>
|
</path>
|
||||||
|
@ -57,28 +59,14 @@
|
||||||
<untar src="${build.dir}/${dict.version}.tar" dest="${build.dir}"/>
|
<untar src="${build.dir}/${dict.version}.tar" dest="${build.dir}"/>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<path id="tools.classpath">
|
<target name="build-dict" depends="compile, download-dict">
|
||||||
<path refid="classpath"/>
|
|
||||||
<pathelement location="${build.dir}/classes/java"/>
|
|
||||||
<pathelement location="${build.dir}/classes/tools"/>
|
|
||||||
</path>
|
|
||||||
|
|
||||||
<path id="tools.test.classpath">
|
|
||||||
<path refid="tools.classpath"/>
|
|
||||||
<path refid="test.base.classpath"/>
|
|
||||||
<pathelement location="${build.dir}/classes/tools-test"/>
|
|
||||||
</path>
|
|
||||||
|
|
||||||
<target name="build-dict" depends="compile-tools, download-dict">
|
|
||||||
<sequential>
|
<sequential>
|
||||||
<delete verbose="true">
|
<delete verbose="true">
|
||||||
<fileset dir="${resources.dir}/org/apache/lucene/analysis/ko/dict" includes="**/*"/>
|
<fileset dir="${resources.dir}/org/apache/lucene/analysis/ko/dict" includes="**/*"/>
|
||||||
</delete>
|
</delete>
|
||||||
<!-- TODO: optimize the dictionary construction a bit so that you don't need 1G -->
|
<!-- TODO: optimize the dictionary construction a bit so that you don't need 1G -->
|
||||||
<java fork="true" failonerror="true" maxmemory="1g" classname="org.apache.lucene.analysis.ko.util.DictionaryBuilder">
|
<java fork="true" failonerror="true" maxmemory="1g" classname="org.apache.lucene.analysis.ko.util.DictionaryBuilder">
|
||||||
<classpath>
|
<classpath refid="classpath"/>
|
||||||
<path refid="tools.classpath"/>
|
|
||||||
</classpath>
|
|
||||||
<assertions>
|
<assertions>
|
||||||
<enable package="org.apache.lucene"/>
|
<enable package="org.apache.lucene"/>
|
||||||
</assertions>
|
</assertions>
|
||||||
|
@ -90,34 +78,7 @@
|
||||||
</sequential>
|
</sequential>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="compile-tools" depends="compile-core, common.compile-tools">
|
<target name="compile-test" depends="module-build.compile-test"/>
|
||||||
<compile
|
|
||||||
srcdir="src/tools/java"
|
|
||||||
destdir="${build.dir}/classes/tools">
|
|
||||||
<classpath>
|
|
||||||
<path refid="tools.classpath"/>
|
|
||||||
</classpath>
|
|
||||||
</compile>
|
|
||||||
</target>
|
|
||||||
|
|
||||||
<target name="compile-tools-tests" depends="compile-tools">
|
|
||||||
<compile
|
|
||||||
srcdir="src/tools/test"
|
|
||||||
destdir="${build.dir}/classes/tools-test">
|
|
||||||
<classpath>
|
|
||||||
<path refid="tools.test.classpath"/>
|
|
||||||
<pathelement path="src/tools/test"/>
|
|
||||||
</classpath>
|
|
||||||
</compile>
|
|
||||||
</target>
|
|
||||||
|
|
||||||
<target name="test-tools" depends="install-junit4-taskdef, compile-tools-tests">
|
|
||||||
<test-macro testsDir="${build.dir}/classes/tools-test" workDir="src/tools/test" junit.classpath="tools.test.classpath"/>
|
|
||||||
</target>
|
|
||||||
|
|
||||||
<target name="compile-test" depends="module-build.compile-test, compile-tools-tests"/>
|
|
||||||
<!-- TODO: not until we properly make 'test-tools' work with clover etc
|
|
||||||
<target name="test" depends="module-build.test, test-tools"/> -->
|
|
||||||
|
|
||||||
<target name="regenerate" depends="build-dict"/>
|
<target name="regenerate" depends="build-dict"/>
|
||||||
</project>
|
</project>
|
||||||
|
|
|
@ -21,7 +21,6 @@ import java.io.InputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.store.InputStreamDataInput;
|
import org.apache.lucene.store.InputStreamDataInput;
|
||||||
import org.apache.lucene.util.IOUtils;
|
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||||
|
|
||||||
|
@ -46,20 +45,9 @@ public final class TokenInfoDictionary extends BinaryDictionary {
|
||||||
*/
|
*/
|
||||||
TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
|
TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
|
||||||
super(resourceScheme, resourcePath);
|
super(resourceScheme, resourcePath);
|
||||||
InputStream is = null;
|
|
||||||
FST<Long> fst;
|
FST<Long> fst;
|
||||||
boolean success = false;
|
try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {
|
||||||
try {
|
|
||||||
is = getResource(FST_FILENAME_SUFFIX);
|
|
||||||
is = new BufferedInputStream(is);
|
|
||||||
fst = new FST<>(new InputStreamDataInput(is), PositiveIntOutputs.getSingleton());
|
fst = new FST<>(new InputStreamDataInput(is), PositiveIntOutputs.getSingleton());
|
||||||
success = true;
|
|
||||||
} finally {
|
|
||||||
if (success) {
|
|
||||||
IOUtils.close(is);
|
|
||||||
} else {
|
|
||||||
IOUtils.closeWhileHandlingException(is);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
this.fst = new TokenInfoFST(fst);
|
this.fst = new TokenInfoFST(fst);
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,13 +17,13 @@
|
||||||
package org.apache.lucene.analysis.ko.util;
|
package org.apache.lucene.analysis.ko.util;
|
||||||
|
|
||||||
import java.io.BufferedOutputStream;
|
import java.io.BufferedOutputStream;
|
||||||
import java.io.File;
|
|
||||||
import java.io.FileOutputStream;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
import java.nio.ByteBuffer;
|
import java.nio.ByteBuffer;
|
||||||
import java.nio.channels.Channels;
|
import java.nio.channels.Channels;
|
||||||
import java.nio.channels.WritableByteChannel;
|
import java.nio.channels.WritableByteChannel;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -37,17 +37,17 @@ import org.apache.lucene.util.ArrayUtil;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.ko.dict.BinaryDictionary;
|
import org.apache.lucene.analysis.ko.dict.BinaryDictionary;
|
||||||
|
|
||||||
public abstract class BinaryDictionaryWriter {
|
abstract class BinaryDictionaryWriter {
|
||||||
private final static int ID_LIMIT = 8192;
|
private final static int ID_LIMIT = 8192;
|
||||||
|
|
||||||
protected final Class<? extends BinaryDictionary> implClazz;
|
private final Class<? extends BinaryDictionary> implClazz;
|
||||||
protected ByteBuffer buffer;
|
protected ByteBuffer buffer;
|
||||||
private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1;
|
private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1;
|
||||||
private int[] targetMap = new int[8192];
|
private int[] targetMap = new int[8192];
|
||||||
private int[] targetMapOffsets = new int[8192];
|
private int[] targetMapOffsets = new int[8192];
|
||||||
private final ArrayList<String> posDict = new ArrayList<>();
|
private final ArrayList<String> posDict = new ArrayList<>();
|
||||||
|
|
||||||
public BinaryDictionaryWriter(Class<? extends BinaryDictionary> implClazz, int size) {
|
BinaryDictionaryWriter(Class<? extends BinaryDictionary> implClazz, int size) {
|
||||||
this.implClazz = implClazz;
|
this.implClazz = implClazz;
|
||||||
buffer = ByteBuffer.allocate(size);
|
buffer = ByteBuffer.allocate(size);
|
||||||
}
|
}
|
||||||
|
@ -183,7 +183,7 @@ public abstract class BinaryDictionaryWriter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addMapping(int sourceId, int wordId) {
|
void addMapping(int sourceId, int wordId) {
|
||||||
if (wordId <= lastWordId) {
|
if (wordId <= lastWordId) {
|
||||||
throw new IllegalStateException("words out of order: " + wordId + " vs lastID: " + lastWordId);
|
throw new IllegalStateException("words out of order: " + wordId + " vs lastID: " + lastWordId);
|
||||||
}
|
}
|
||||||
|
@ -205,27 +205,26 @@ public abstract class BinaryDictionaryWriter {
|
||||||
lastWordId = wordId;
|
lastWordId = wordId;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected final String getBaseFileName(String baseDir) {
|
final String getBaseFileName() {
|
||||||
return baseDir + File.separator + implClazz.getName().replace('.', File.separatorChar);
|
return implClazz.getName().replace('.', '/');
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Write dictionary in file
|
* Write dictionary in file
|
||||||
* @throws IOException if an I/O error occurs writing the dictionary files
|
* @throws IOException if an I/O error occurs writing the dictionary files
|
||||||
*/
|
*/
|
||||||
public void write(String baseDir) throws IOException {
|
public void write(Path baseDir) throws IOException {
|
||||||
final String baseName = getBaseFileName(baseDir);
|
final String baseName = getBaseFileName();
|
||||||
writeDictionary(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX);
|
writeDictionary(baseDir.resolve(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX));
|
||||||
writeTargetMap(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX);
|
writeTargetMap(baseDir.resolve(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX));
|
||||||
writePosDict(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX);
|
writePosDict(baseDir.resolve(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX));
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void writeTargetMap(String filename) throws IOException {
|
private void writeTargetMap(Path path) throws IOException {
|
||||||
new File(filename).getParentFile().mkdirs();
|
Files.createDirectories(path.getParent());
|
||||||
OutputStream os = new FileOutputStream(filename);
|
try (OutputStream os = Files.newOutputStream(path);
|
||||||
try {
|
OutputStream bos = new BufferedOutputStream(os)) {
|
||||||
os = new BufferedOutputStream(os);
|
final DataOutput out = new OutputStreamDataOutput(bos);
|
||||||
final DataOutput out = new OutputStreamDataOutput(os);
|
|
||||||
CodecUtil.writeHeader(out, BinaryDictionary.TARGETMAP_HEADER, BinaryDictionary.VERSION);
|
CodecUtil.writeHeader(out, BinaryDictionary.TARGETMAP_HEADER, BinaryDictionary.VERSION);
|
||||||
|
|
||||||
final int numSourceIds = lastSourceId + 1;
|
final int numSourceIds = lastSourceId + 1;
|
||||||
|
@ -246,17 +245,14 @@ public abstract class BinaryDictionaryWriter {
|
||||||
if (sourceId != numSourceIds) {
|
if (sourceId != numSourceIds) {
|
||||||
throw new IllegalStateException("sourceId:" + sourceId + " != numSourceIds:" + numSourceIds);
|
throw new IllegalStateException("sourceId:" + sourceId + " != numSourceIds:" + numSourceIds);
|
||||||
}
|
}
|
||||||
} finally {
|
|
||||||
os.close();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void writePosDict(String filename) throws IOException {
|
private void writePosDict(Path path) throws IOException {
|
||||||
new File(filename).getParentFile().mkdirs();
|
Files.createDirectories(path.getParent());
|
||||||
OutputStream os = new FileOutputStream(filename);
|
try (OutputStream os = Files.newOutputStream(path);
|
||||||
try {
|
OutputStream bos = new BufferedOutputStream(os)) {
|
||||||
os = new BufferedOutputStream(os);
|
final DataOutput out = new OutputStreamDataOutput(bos);
|
||||||
final DataOutput out = new OutputStreamDataOutput(os);
|
|
||||||
CodecUtil.writeHeader(out, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION);
|
CodecUtil.writeHeader(out, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION);
|
||||||
out.writeVInt(posDict.size());
|
out.writeVInt(posDict.size());
|
||||||
for (String s : posDict) {
|
for (String s : posDict) {
|
||||||
|
@ -270,25 +266,21 @@ public abstract class BinaryDictionaryWriter {
|
||||||
out.writeByte((byte) POS.Tag.valueOf(data[0]).ordinal());
|
out.writeByte((byte) POS.Tag.valueOf(data[0]).ordinal());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} finally {
|
|
||||||
os.close();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void writeDictionary(String filename) throws IOException {
|
private void writeDictionary(Path path) throws IOException {
|
||||||
new File(filename).getParentFile().mkdirs();
|
Files.createDirectories(path.getParent());
|
||||||
final FileOutputStream os = new FileOutputStream(filename);
|
try (OutputStream os = Files.newOutputStream(path);
|
||||||
try {
|
OutputStream bos = new BufferedOutputStream(os)) {
|
||||||
final DataOutput out = new OutputStreamDataOutput(os);
|
final DataOutput out = new OutputStreamDataOutput(bos);
|
||||||
CodecUtil.writeHeader(out, BinaryDictionary.DICT_HEADER, BinaryDictionary.VERSION);
|
CodecUtil.writeHeader(out, BinaryDictionary.DICT_HEADER, BinaryDictionary.VERSION);
|
||||||
out.writeVInt(buffer.position());
|
out.writeVInt(buffer.position());
|
||||||
final WritableByteChannel channel = Channels.newChannel(os);
|
final WritableByteChannel channel = Channels.newChannel(bos);
|
||||||
// Write Buffer
|
// Write Buffer
|
||||||
buffer.flip(); // set position to 0, set limit to current position
|
buffer.flip(); // set position to 0, set limit to current position
|
||||||
channel.write(buffer);
|
channel.write(buffer);
|
||||||
assert buffer.remaining() == 0L;
|
assert buffer.remaining() == 0L;
|
||||||
} finally {
|
|
||||||
os.close();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -17,10 +17,10 @@
|
||||||
package org.apache.lucene.analysis.ko.util;
|
package org.apache.lucene.analysis.ko.util;
|
||||||
|
|
||||||
import java.io.BufferedOutputStream;
|
import java.io.BufferedOutputStream;
|
||||||
import java.io.File;
|
|
||||||
import java.io.FileOutputStream;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
|
import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
|
||||||
|
@ -29,7 +29,7 @@ import org.apache.lucene.codecs.CodecUtil;
|
||||||
import org.apache.lucene.store.DataOutput;
|
import org.apache.lucene.store.DataOutput;
|
||||||
import org.apache.lucene.store.OutputStreamDataOutput;
|
import org.apache.lucene.store.OutputStreamDataOutput;
|
||||||
|
|
||||||
public final class CharacterDefinitionWriter {
|
final class CharacterDefinitionWriter {
|
||||||
|
|
||||||
private final byte[] characterCategoryMap = new byte[0x10000];
|
private final byte[] characterCategoryMap = new byte[0x10000];
|
||||||
|
|
||||||
|
@ -39,7 +39,7 @@ public final class CharacterDefinitionWriter {
|
||||||
/**
|
/**
|
||||||
* Constructor for building. TODO: remove write access
|
* Constructor for building. TODO: remove write access
|
||||||
*/
|
*/
|
||||||
public CharacterDefinitionWriter() {
|
CharacterDefinitionWriter() {
|
||||||
Arrays.fill(characterCategoryMap, CharacterDefinition.DEFAULT);
|
Arrays.fill(characterCategoryMap, CharacterDefinition.DEFAULT);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -50,7 +50,7 @@ public final class CharacterDefinitionWriter {
|
||||||
* code point
|
* code point
|
||||||
* @param characterClassName character class name
|
* @param characterClassName character class name
|
||||||
*/
|
*/
|
||||||
public void putCharacterCategory(int codePoint, String characterClassName) {
|
void putCharacterCategory(int codePoint, String characterClassName) {
|
||||||
characterClassName = characterClassName.split(" ")[0]; // use first
|
characterClassName = characterClassName.split(" ")[0]; // use first
|
||||||
// category
|
// category
|
||||||
// class
|
// class
|
||||||
|
@ -62,20 +62,17 @@ public final class CharacterDefinitionWriter {
|
||||||
characterCategoryMap[codePoint] = CharacterDefinition.lookupCharacterClass(characterClassName);
|
characterCategoryMap[codePoint] = CharacterDefinition.lookupCharacterClass(characterClassName);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
|
void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
|
||||||
final byte characterClass = CharacterDefinition.lookupCharacterClass(characterClassName);
|
final byte characterClass = CharacterDefinition.lookupCharacterClass(characterClassName);
|
||||||
invokeMap[characterClass] = invoke == 1;
|
invokeMap[characterClass] = invoke == 1;
|
||||||
groupMap[characterClass] = group == 1;
|
groupMap[characterClass] = group == 1;
|
||||||
// TODO: length def ignored
|
// TODO: length def ignored
|
||||||
}
|
}
|
||||||
|
|
||||||
public void write(String baseDir) throws IOException {
|
public void write(Path baseDir) throws IOException {
|
||||||
String filename = baseDir + File.separator +
|
Path path = baseDir.resolve(CharacterDefinition.class.getName().replace('.', '/') + CharacterDefinition.FILENAME_SUFFIX);
|
||||||
CharacterDefinition.class.getName().replace('.', File.separatorChar) + CharacterDefinition.FILENAME_SUFFIX;
|
Files.createDirectories(path.getParent());
|
||||||
new File(filename).getParentFile().mkdirs();
|
try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(path))){
|
||||||
OutputStream os = new FileOutputStream(filename);
|
|
||||||
try {
|
|
||||||
os = new BufferedOutputStream(os);
|
|
||||||
final DataOutput out = new OutputStreamDataOutput(os);
|
final DataOutput out = new OutputStreamDataOutput(os);
|
||||||
CodecUtil.writeHeader(out, CharacterDefinition.HEADER, CharacterDefinition.VERSION);
|
CodecUtil.writeHeader(out, CharacterDefinition.HEADER, CharacterDefinition.VERSION);
|
||||||
out.writeBytes(characterCategoryMap, 0, characterCategoryMap.length);
|
out.writeBytes(characterCategoryMap, 0, characterCategoryMap.length);
|
||||||
|
@ -86,8 +83,6 @@ public final class CharacterDefinitionWriter {
|
||||||
);
|
);
|
||||||
out.writeByte(b);
|
out.writeByte(b);
|
||||||
}
|
}
|
||||||
} finally {
|
|
||||||
os.close();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,61 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.ko.util;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.LineNumberReader;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
class ConnectionCostsBuilder {
|
||||||
|
|
||||||
|
private ConnectionCostsBuilder() {
|
||||||
|
}
|
||||||
|
|
||||||
|
public static ConnectionCostsWriter build(Path path) throws IOException {
|
||||||
|
try (Reader reader = Files.newBufferedReader(path, StandardCharsets.US_ASCII);
|
||||||
|
LineNumberReader lineReader = new LineNumberReader(reader)) {
|
||||||
|
|
||||||
|
String line = lineReader.readLine();
|
||||||
|
String[] dimensions = line.split("\\s+");
|
||||||
|
|
||||||
|
assert dimensions.length == 2;
|
||||||
|
|
||||||
|
int forwardSize = Integer.parseInt(dimensions[0]);
|
||||||
|
int backwardSize = Integer.parseInt(dimensions[1]);
|
||||||
|
|
||||||
|
assert forwardSize > 0 && backwardSize > 0;
|
||||||
|
|
||||||
|
ConnectionCostsWriter costs = new ConnectionCostsWriter(forwardSize, backwardSize);
|
||||||
|
|
||||||
|
while ((line = lineReader.readLine()) != null) {
|
||||||
|
String[] fields = line.split("\\s+");
|
||||||
|
|
||||||
|
assert fields.length == 3;
|
||||||
|
|
||||||
|
int forwardId = Integer.parseInt(fields[0]);
|
||||||
|
int backwardId = Integer.parseInt(fields[1]);
|
||||||
|
int cost = Integer.parseInt(fields[2]);
|
||||||
|
|
||||||
|
costs.add(forwardId, backwardId, cost);
|
||||||
|
}
|
||||||
|
return costs;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -17,10 +17,10 @@
|
||||||
package org.apache.lucene.analysis.ko.util;
|
package org.apache.lucene.analysis.ko.util;
|
||||||
|
|
||||||
import java.io.BufferedOutputStream;
|
import java.io.BufferedOutputStream;
|
||||||
import java.io.File;
|
|
||||||
import java.io.FileOutputStream;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.ko.dict.ConnectionCosts;
|
import org.apache.lucene.analysis.ko.dict.ConnectionCosts;
|
||||||
|
|
||||||
|
@ -28,7 +28,7 @@ import org.apache.lucene.codecs.CodecUtil;
|
||||||
import org.apache.lucene.store.DataOutput;
|
import org.apache.lucene.store.DataOutput;
|
||||||
import org.apache.lucene.store.OutputStreamDataOutput;
|
import org.apache.lucene.store.OutputStreamDataOutput;
|
||||||
|
|
||||||
public final class ConnectionCostsWriter {
|
final class ConnectionCostsWriter {
|
||||||
|
|
||||||
private final short[][] costs; // array is backward IDs first since get is called using the same backward ID consecutively. maybe doesn't matter.
|
private final short[][] costs; // array is backward IDs first since get is called using the same backward ID consecutively. maybe doesn't matter.
|
||||||
private final int forwardSize;
|
private final int forwardSize;
|
||||||
|
@ -36,7 +36,7 @@ public final class ConnectionCostsWriter {
|
||||||
/**
|
/**
|
||||||
* Constructor for building. TODO: remove write access
|
* Constructor for building. TODO: remove write access
|
||||||
*/
|
*/
|
||||||
public ConnectionCostsWriter(int forwardSize, int backwardSize) {
|
ConnectionCostsWriter(int forwardSize, int backwardSize) {
|
||||||
this.forwardSize = forwardSize;
|
this.forwardSize = forwardSize;
|
||||||
this.backwardSize = backwardSize;
|
this.backwardSize = backwardSize;
|
||||||
this.costs = new short[backwardSize][forwardSize];
|
this.costs = new short[backwardSize][forwardSize];
|
||||||
|
@ -46,14 +46,12 @@ public final class ConnectionCostsWriter {
|
||||||
this.costs[backwardId][forwardId] = (short)cost;
|
this.costs[backwardId][forwardId] = (short)cost;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void write(String baseDir) throws IOException {
|
public void write(Path baseDir) throws IOException {
|
||||||
String filename = baseDir + File.separator +
|
Files.createDirectories(baseDir);
|
||||||
ConnectionCosts.class.getName().replace('.', File.separatorChar) + ConnectionCosts.FILENAME_SUFFIX;
|
String fileName = ConnectionCosts.class.getName().replace('.', '/') + ConnectionCosts.FILENAME_SUFFIX;
|
||||||
new File(filename).getParentFile().mkdirs();
|
try (OutputStream os = Files.newOutputStream(baseDir.resolve(fileName));
|
||||||
OutputStream os = new FileOutputStream(filename);
|
OutputStream bos = new BufferedOutputStream(os)) {
|
||||||
try {
|
final DataOutput out = new OutputStreamDataOutput(bos);
|
||||||
os = new BufferedOutputStream(os);
|
|
||||||
final DataOutput out = new OutputStreamDataOutput(os);
|
|
||||||
CodecUtil.writeHeader(out, ConnectionCosts.HEADER, ConnectionCosts.VERSION);
|
CodecUtil.writeHeader(out, ConnectionCosts.HEADER, ConnectionCosts.VERSION);
|
||||||
out.writeVInt(forwardSize);
|
out.writeVInt(forwardSize);
|
||||||
out.writeVInt(backwardSize);
|
out.writeVInt(backwardSize);
|
||||||
|
@ -61,14 +59,12 @@ public final class ConnectionCostsWriter {
|
||||||
assert costs.length == backwardSize;
|
assert costs.length == backwardSize;
|
||||||
for (short[] a : costs) {
|
for (short[] a : costs) {
|
||||||
assert a.length == forwardSize;
|
assert a.length == forwardSize;
|
||||||
for (int i = 0; i < a.length; i++) {
|
for (short cost : a) {
|
||||||
int delta = (int)a[i] - last;
|
int delta = (int) cost - last;
|
||||||
out.writeZInt(delta);
|
out.writeZInt(delta);
|
||||||
last = a[i];
|
last = cost;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} finally {
|
|
||||||
os.close();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,54 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.ko.util;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tool to build dictionaries.
|
||||||
|
*/
|
||||||
|
public class DictionaryBuilder {
|
||||||
|
|
||||||
|
private DictionaryBuilder() {
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void build(Path inputDir, Path outputDir, String encoding, boolean normalizeEntry) throws IOException {
|
||||||
|
// Build TokenInfo Dictionary
|
||||||
|
new TokenInfoDictionaryBuilder(encoding, normalizeEntry)
|
||||||
|
.build(inputDir)
|
||||||
|
.write(outputDir);
|
||||||
|
|
||||||
|
// Build Unknown Word Dictionary
|
||||||
|
new UnknownDictionaryBuilder(encoding)
|
||||||
|
.build(inputDir)
|
||||||
|
.write(outputDir);
|
||||||
|
|
||||||
|
// Build Connection Cost
|
||||||
|
ConnectionCostsBuilder.build(inputDir.resolve("matrix.def"))
|
||||||
|
.write(outputDir);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) throws IOException {
|
||||||
|
String inputDirname = args[0];
|
||||||
|
String outputDirname = args[1];
|
||||||
|
String inputEncoding = args[2];
|
||||||
|
boolean normalizeEntries = Boolean.parseBoolean(args[3]);
|
||||||
|
DictionaryBuilder.build(Paths.get(inputDirname), Paths.get(outputDirname), inputEncoding, normalizeEntries);
|
||||||
|
}
|
||||||
|
}
|
|
@ -17,20 +17,17 @@
|
||||||
package org.apache.lucene.analysis.ko.util;
|
package org.apache.lucene.analysis.ko.util;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.File;
|
|
||||||
import java.io.FileInputStream;
|
|
||||||
import java.io.FilenameFilter;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
import java.nio.charset.CharsetDecoder;
|
import java.nio.file.Files;
|
||||||
import java.nio.charset.CodingErrorAction;
|
import java.nio.file.Path;
|
||||||
import java.text.Normalizer;
|
import java.text.Normalizer;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.fst.Builder;
|
import org.apache.lucene.util.fst.Builder;
|
||||||
|
@ -38,72 +35,59 @@ import org.apache.lucene.util.fst.FST;
|
||||||
|
|
||||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||||
|
|
||||||
public class TokenInfoDictionaryBuilder {
|
class TokenInfoDictionaryBuilder {
|
||||||
|
|
||||||
/** Internal word id - incrementally assigned as entries are read and added. This will be byte offset of dictionary file */
|
/** Internal word id - incrementally assigned as entries are read and added. This will be byte offset of dictionary file */
|
||||||
private int offset = 0;
|
private int offset = 0;
|
||||||
|
|
||||||
private String encoding = "utf-8";
|
private String encoding;
|
||||||
|
|
||||||
private Normalizer.Form normalForm;
|
private Normalizer.Form normalForm;
|
||||||
|
|
||||||
public TokenInfoDictionaryBuilder(String encoding, boolean normalizeEntries) {
|
TokenInfoDictionaryBuilder(String encoding, boolean normalizeEntries) {
|
||||||
this.encoding = encoding;
|
this.encoding = encoding;
|
||||||
this.normalForm = normalizeEntries ? Normalizer.Form.NFKC : null;
|
normalForm = normalizeEntries ? Normalizer.Form.NFKC : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public TokenInfoDictionaryWriter build(String dirname) throws IOException {
|
public TokenInfoDictionaryWriter build(Path dir) throws IOException {
|
||||||
FilenameFilter filter = (dir, name) -> name.endsWith(".csv");
|
try (Stream<Path> files = Files.list(dir)) {
|
||||||
ArrayList<File> csvFiles = new ArrayList<>();
|
List<Path> csvFiles = files
|
||||||
for (File file : new File(dirname).listFiles(filter)) {
|
.filter(path -> path.getFileName().toString().endsWith(".csv"))
|
||||||
csvFiles.add(file);
|
.sorted()
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
return buildDictionary(csvFiles);
|
||||||
}
|
}
|
||||||
Collections.sort(csvFiles);
|
|
||||||
return buildDictionary(csvFiles);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public TokenInfoDictionaryWriter buildDictionary(List<File> csvFiles) throws IOException {
|
private TokenInfoDictionaryWriter buildDictionary(List<Path> csvFiles) throws IOException {
|
||||||
TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);
|
TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);
|
||||||
|
|
||||||
// all lines in the file
|
// all lines in the file
|
||||||
System.out.println(" parse...");
|
|
||||||
List<String[]> lines = new ArrayList<>(400000);
|
List<String[]> lines = new ArrayList<>(400000);
|
||||||
for (File file : csvFiles){
|
for (Path path : csvFiles) {
|
||||||
FileInputStream inputStream = new FileInputStream(file);
|
try (BufferedReader reader = Files.newBufferedReader(path, Charset.forName(encoding))) {
|
||||||
Charset cs = Charset.forName(encoding);
|
String line;
|
||||||
CharsetDecoder decoder = cs.newDecoder()
|
while ((line = reader.readLine()) != null) {
|
||||||
.onMalformedInput(CodingErrorAction.REPORT)
|
String[] entry = CSVUtil.parse(line);
|
||||||
.onUnmappableCharacter(CodingErrorAction.REPORT);
|
|
||||||
InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
|
|
||||||
BufferedReader reader = new BufferedReader(streamReader);
|
|
||||||
|
|
||||||
String line = null;
|
|
||||||
while ((line = reader.readLine()) != null) {
|
|
||||||
String[] entry = CSVUtil.parse(line);
|
|
||||||
|
|
||||||
if(entry.length < 12) {
|
if (entry.length < 12) {
|
||||||
throw new IllegalArgumentException("Entry in CSV is not valid (12 field values expected): " + line);
|
throw new IllegalArgumentException("Entry in CSV is not valid (12 field values expected): " + line);
|
||||||
}
|
}
|
||||||
|
|
||||||
// NFKC normalize dictionary entry
|
// NFKC normalize dictionary entry
|
||||||
if (normalForm != null) {
|
if (normalForm != null) {
|
||||||
String[] normalizedEntry = new String[entry.length];
|
String[] normalizedEntry = new String[entry.length];
|
||||||
for (int i = 0; i < entry.length; i++) {
|
for (int i = 0; i < entry.length; i++) {
|
||||||
normalizedEntry[i] = Normalizer.normalize(entry[i], normalForm);
|
normalizedEntry[i] = Normalizer.normalize(entry[i], normalForm);
|
||||||
|
}
|
||||||
|
lines.add(normalizedEntry);
|
||||||
|
} else {
|
||||||
|
lines.add(entry);
|
||||||
}
|
}
|
||||||
lines.add(normalizedEntry);
|
|
||||||
} else {
|
|
||||||
lines.add(entry);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
System.out.println(" sort...");
|
|
||||||
|
|
||||||
// sort by term: we sorted the files already and use a stable sort.
|
// sort by term: we sorted the files already and use a stable sort.
|
||||||
Collections.sort(lines, Comparator.comparing(left -> left[0]));
|
lines.sort(Comparator.comparing(left -> left[0]));
|
||||||
|
|
||||||
System.out.println(" encode...");
|
|
||||||
|
|
||||||
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
||||||
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
|
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
|
||||||
|
@ -111,7 +95,7 @@ public class TokenInfoDictionaryBuilder {
|
||||||
long ord = -1; // first ord will be 0
|
long ord = -1; // first ord will be 0
|
||||||
String lastValue = null;
|
String lastValue = null;
|
||||||
|
|
||||||
// build tokeninfo dictionary
|
// build token info dictionary
|
||||||
for (String[] entry : lines) {
|
for (String[] entry : lines) {
|
||||||
String surfaceForm = entry[0].trim();
|
String surfaceForm = entry[0].trim();
|
||||||
if (surfaceForm.isEmpty()) {
|
if (surfaceForm.isEmpty()) {
|
||||||
|
@ -119,9 +103,8 @@ public class TokenInfoDictionaryBuilder {
|
||||||
}
|
}
|
||||||
int next = dictionary.put(entry);
|
int next = dictionary.put(entry);
|
||||||
|
|
||||||
if(next == offset){
|
if(next == offset) {
|
||||||
System.out.println("Failed to process line: " + Arrays.toString(entry));
|
throw new IllegalStateException("Failed to process line: " + Arrays.toString(entry));
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!surfaceForm.equals(lastValue)) {
|
if (!surfaceForm.equals(lastValue)) {
|
||||||
|
@ -135,16 +118,10 @@ public class TokenInfoDictionaryBuilder {
|
||||||
}
|
}
|
||||||
fstBuilder.add(scratch.get(), ord);
|
fstBuilder.add(scratch.get(), ord);
|
||||||
}
|
}
|
||||||
dictionary.addMapping((int)ord, offset);
|
dictionary.addMapping((int) ord, offset);
|
||||||
offset = next;
|
offset = next;
|
||||||
}
|
}
|
||||||
|
dictionary.setFST(fstBuilder.finish());
|
||||||
final FST<Long> fst = fstBuilder.finish();
|
|
||||||
|
|
||||||
System.out.print(" " + fstBuilder.getNodeCount() + " nodes, " + fstBuilder.getArcCount() + " arcs, " + fst.ramBytesUsed() + " bytes... ");
|
|
||||||
dictionary.setFST(fst);
|
|
||||||
System.out.println(" done");
|
|
||||||
|
|
||||||
return dictionary;
|
return dictionary;
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -19,31 +19,31 @@ package org.apache.lucene.analysis.ko.util;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.nio.file.Paths;
|
import java.util.Objects;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary;
|
import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
|
|
||||||
public class TokenInfoDictionaryWriter extends BinaryDictionaryWriter {
|
class TokenInfoDictionaryWriter extends BinaryDictionaryWriter {
|
||||||
private FST<Long> fst;
|
private FST<Long> fst;
|
||||||
|
|
||||||
public TokenInfoDictionaryWriter(int size) {
|
TokenInfoDictionaryWriter(int size) {
|
||||||
super(TokenInfoDictionary.class, size);
|
super(TokenInfoDictionary.class, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setFST(FST<Long> fst) {
|
public void setFST(FST<Long> fst) {
|
||||||
|
Objects.requireNonNull(fst, "dictionary must not be empty");
|
||||||
this.fst = fst;
|
this.fst = fst;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void write(String baseDir) throws IOException {
|
public void write(Path baseDir) throws IOException {
|
||||||
super.write(baseDir);
|
super.write(baseDir);
|
||||||
writeFST(getBaseFileName(baseDir) + TokenInfoDictionary.FST_FILENAME_SUFFIX);
|
writeFST(baseDir.resolve(getBaseFileName() + TokenInfoDictionary.FST_FILENAME_SUFFIX));
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void writeFST(String filename) throws IOException {
|
private void writeFST(Path path) throws IOException {
|
||||||
Path p = Paths.get(filename);
|
Files.createDirectories(path.getParent());
|
||||||
Files.createDirectories(p.getParent());
|
fst.save(path);
|
||||||
fst.save(p);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -0,0 +1,118 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.ko.util;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.LineNumberReader;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
|
||||||
|
|
||||||
|
class UnknownDictionaryBuilder {
|
||||||
|
private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,1798,3559,3677,SY,*,*,*,*,*,*,*";
|
||||||
|
|
||||||
|
private String encoding;
|
||||||
|
|
||||||
|
UnknownDictionaryBuilder(String encoding) {
|
||||||
|
this.encoding = encoding;
|
||||||
|
}
|
||||||
|
|
||||||
|
public UnknownDictionaryWriter build(Path dir) throws IOException {
|
||||||
|
UnknownDictionaryWriter unkDictionary = readDictionaryFile(dir.resolve("unk.def")); //Should be only one file
|
||||||
|
readCharacterDefinition(dir.resolve("char.def"), unkDictionary);
|
||||||
|
return unkDictionary;
|
||||||
|
}
|
||||||
|
|
||||||
|
private UnknownDictionaryWriter readDictionaryFile(Path path) throws IOException {
|
||||||
|
return readDictionaryFile(path, encoding);
|
||||||
|
}
|
||||||
|
|
||||||
|
private UnknownDictionaryWriter readDictionaryFile(Path path, String encoding) throws IOException {
|
||||||
|
UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);
|
||||||
|
|
||||||
|
List<String[]> lines = new ArrayList<>();
|
||||||
|
try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding));
|
||||||
|
LineNumberReader lineReader = new LineNumberReader(reader)) {
|
||||||
|
|
||||||
|
dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));
|
||||||
|
|
||||||
|
String line;
|
||||||
|
while ((line = lineReader.readLine()) != null) {
|
||||||
|
// note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation,
|
||||||
|
// even though the unknown dictionary returns hardcoded null here.
|
||||||
|
final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
|
||||||
|
lines.add(parsed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
lines.sort(Comparator.comparingInt(entry -> CharacterDefinition.lookupCharacterClass(entry[0])));
|
||||||
|
|
||||||
|
for (String[] entry : lines) {
|
||||||
|
dictionary.put(entry);
|
||||||
|
}
|
||||||
|
|
||||||
|
return dictionary;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void readCharacterDefinition(Path path, UnknownDictionaryWriter dictionary) throws IOException {
|
||||||
|
try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding));
|
||||||
|
LineNumberReader lineReader = new LineNumberReader(reader)) {
|
||||||
|
|
||||||
|
String line;
|
||||||
|
while ((line = lineReader.readLine()) != null) {
|
||||||
|
line = line.replaceAll("^\\s", "");
|
||||||
|
line = line.replaceAll("\\s*#.*", "");
|
||||||
|
line = line.replaceAll("\\s+", " ");
|
||||||
|
|
||||||
|
// Skip empty line or comment line
|
||||||
|
if (line.length() == 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (line.startsWith("0x")) { // Category mapping
|
||||||
|
String[] values = line.split(" ", 2); // Split only first space
|
||||||
|
|
||||||
|
if (!values[0].contains("..")) {
|
||||||
|
int cp = Integer.decode(values[0]);
|
||||||
|
dictionary.putCharacterCategory(cp, values[1]);
|
||||||
|
} else {
|
||||||
|
String[] codePoints = values[0].split("\\.\\.");
|
||||||
|
int cpFrom = Integer.decode(codePoints[0]);
|
||||||
|
int cpTo = Integer.decode(codePoints[1]);
|
||||||
|
|
||||||
|
for (int i = cpFrom; i <= cpTo; i++) {
|
||||||
|
dictionary.putCharacterCategory(i, values[1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else { // Invoke definition
|
||||||
|
String[] values = line.split(" "); // Consecutive space is merged above
|
||||||
|
String characterClassName = values[0];
|
||||||
|
int invoke = Integer.parseInt(values[1]);
|
||||||
|
int group = Integer.parseInt(values[2]);
|
||||||
|
int length = Integer.parseInt(values[3]);
|
||||||
|
dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -17,11 +17,12 @@
|
||||||
package org.apache.lucene.analysis.ko.util;
|
package org.apache.lucene.analysis.ko.util;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
|
import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
|
||||||
import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
|
import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
|
||||||
|
|
||||||
public class UnknownDictionaryWriter extends BinaryDictionaryWriter {
|
class UnknownDictionaryWriter extends BinaryDictionaryWriter {
|
||||||
|
|
||||||
private final CharacterDefinitionWriter characterDefinition = new CharacterDefinitionWriter();
|
private final CharacterDefinitionWriter characterDefinition = new CharacterDefinitionWriter();
|
||||||
|
|
||||||
|
@ -58,7 +59,7 @@ public class UnknownDictionaryWriter extends BinaryDictionaryWriter {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void write(String baseDir) throws IOException {
|
public void write(Path baseDir) throws IOException {
|
||||||
super.write(baseDir);
|
super.write(baseDir);
|
||||||
characterDefinition.write(baseDir);
|
characterDefinition.write(baseDir);
|
||||||
}
|
}
|
Binary file not shown.
|
@ -16,15 +16,74 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.analysis.ko.dict;
|
package org.apache.lucene.analysis.ko.dict;
|
||||||
|
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.io.OutputStreamWriter;
|
||||||
|
import java.io.PrintWriter;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.ko.POS;
|
import org.apache.lucene.analysis.ko.POS;
|
||||||
|
import org.apache.lucene.analysis.ko.util.DictionaryBuilder;
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
import org.apache.lucene.util.UnicodeUtil;
|
import org.apache.lucene.util.UnicodeUtil;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.IntsRefFSTEnum;
|
import org.apache.lucene.util.fst.IntsRefFSTEnum;
|
||||||
import org.apache.lucene.util.fst.IntsRefFSTEnum.InputOutput;
|
|
||||||
|
|
||||||
public class TestTokenInfoDictionary extends LuceneTestCase {
|
import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.ResourceScheme;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests of TokenInfoDictionary build tools; run using ant test-tools
|
||||||
|
*/
|
||||||
|
public class TokenInfoDictionaryTest extends LuceneTestCase {
|
||||||
|
|
||||||
|
public void testPut() throws Exception {
|
||||||
|
TokenInfoDictionary dict = newDictionary("명사,1,1,2,NNG,*,*,*,*,*,*,*",
|
||||||
|
// "large" id
|
||||||
|
"일반,5000,5000,3,NNG,*,*,*,*,*,*,*");
|
||||||
|
IntsRef wordIdRef = new IntsRefBuilder().get();
|
||||||
|
|
||||||
|
dict.lookupWordIds(0, wordIdRef);
|
||||||
|
int wordId = wordIdRef.ints[wordIdRef.offset];
|
||||||
|
assertEquals(1, dict.getLeftId(wordId));
|
||||||
|
assertEquals(1, dict.getRightId(wordId));
|
||||||
|
assertEquals(2, dict.getWordCost(wordId));
|
||||||
|
|
||||||
|
dict.lookupWordIds(1, wordIdRef);
|
||||||
|
wordId = wordIdRef.ints[wordIdRef.offset];
|
||||||
|
assertEquals(5000, dict.getLeftId(wordId));
|
||||||
|
assertEquals(5000, dict.getRightId(wordId));
|
||||||
|
assertEquals(3, dict.getWordCost(wordId));
|
||||||
|
}
|
||||||
|
|
||||||
|
private TokenInfoDictionary newDictionary(String... entries) throws Exception {
|
||||||
|
Path dir = createTempDir();
|
||||||
|
try (OutputStream out = Files.newOutputStream(dir.resolve("test.csv"));
|
||||||
|
PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8))) {
|
||||||
|
for (String entry : entries) {
|
||||||
|
printer.println(entry);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Files.createFile(dir.resolve("unk.def"));
|
||||||
|
Files.createFile(dir.resolve("char.def"));
|
||||||
|
try (OutputStream out = Files.newOutputStream(dir.resolve("matrix.def"));
|
||||||
|
PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8))) {
|
||||||
|
printer.println("1 1");
|
||||||
|
}
|
||||||
|
DictionaryBuilder.build(dir, dir, "utf-8", true);
|
||||||
|
String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', '/');
|
||||||
|
// We must also load the other files (in BinaryDictionary) from the correct path
|
||||||
|
return new TokenInfoDictionary(ResourceScheme.FILE, dir.resolve(dictionaryPath).toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testPutException() {
|
||||||
|
//too few columns
|
||||||
|
expectThrows(IllegalArgumentException.class, () -> newDictionary("HANGUL,1,1,1,NNG,*,*,*,*,*"));
|
||||||
|
// id too large
|
||||||
|
expectThrows(IllegalArgumentException.class, () -> newDictionary("HANGUL,8192,8192,1,NNG,*,*,*,*,*,*,*"));
|
||||||
|
}
|
||||||
|
|
||||||
/** enumerates the entire FST/lookup data and just does basic sanity checks */
|
/** enumerates the entire FST/lookup data and just does basic sanity checks */
|
||||||
public void testEnumerateAll() throws Exception {
|
public void testEnumerateAll() throws Exception {
|
||||||
|
@ -38,12 +97,12 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
|
||||||
ConnectionCosts matrix = ConnectionCosts.getInstance();
|
ConnectionCosts matrix = ConnectionCosts.getInstance();
|
||||||
FST<Long> fst = tid.getFST().getInternalFST();
|
FST<Long> fst = tid.getFST().getInternalFST();
|
||||||
IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst);
|
IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst);
|
||||||
InputOutput<Long> mapping;
|
IntsRefFSTEnum.InputOutput<Long> mapping;
|
||||||
IntsRef scratch = new IntsRef();
|
IntsRef scratch = new IntsRef();
|
||||||
while ((mapping = fstEnum.next()) != null) {
|
while ((mapping = fstEnum.next()) != null) {
|
||||||
numTerms++;
|
numTerms++;
|
||||||
IntsRef input = mapping.input;
|
IntsRef input = mapping.input;
|
||||||
char chars[] = new char[input.length];
|
char[] chars = new char[input.length];
|
||||||
for (int i = 0; i < chars.length; i++) {
|
for (int i = 0; i < chars.length; i++) {
|
||||||
chars[i] = (char)input.ints[input.offset+i];
|
chars[i] = (char)input.ints[input.offset+i];
|
||||||
}
|
}
|
||||||
|
@ -51,7 +110,7 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
|
||||||
assertFalse(surfaceForm.isEmpty());
|
assertFalse(surfaceForm.isEmpty());
|
||||||
assertEquals(surfaceForm.trim(), surfaceForm);
|
assertEquals(surfaceForm.trim(), surfaceForm);
|
||||||
assertTrue(UnicodeUtil.validUTF16String(surfaceForm));
|
assertTrue(UnicodeUtil.validUTF16String(surfaceForm));
|
||||||
|
|
||||||
Long output = mapping.output;
|
Long output = mapping.output;
|
||||||
int sourceId = output.intValue();
|
int sourceId = output.intValue();
|
||||||
// we walk in order, terms, sourceIds, and wordIds should always be increasing
|
// we walk in order, terms, sourceIds, and wordIds should always be increasing
|
|
@ -14,11 +14,8 @@
|
||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.analysis.ko.dict;
|
package org.apache.lucene.analysis.ko.util;
|
||||||
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.ko.util.CSVUtil;
|
|
||||||
import org.apache.lucene.analysis.ko.util.UnknownDictionaryWriter;
|
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
|
@ -1,67 +0,0 @@
|
||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.analysis.ko.util;
|
|
||||||
|
|
||||||
import java.io.FileInputStream;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.io.LineNumberReader;
|
|
||||||
import java.nio.charset.Charset;
|
|
||||||
import java.nio.charset.CharsetDecoder;
|
|
||||||
import java.nio.charset.CodingErrorAction;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
|
|
||||||
public class ConnectionCostsBuilder {
|
|
||||||
|
|
||||||
private ConnectionCostsBuilder() {
|
|
||||||
}
|
|
||||||
|
|
||||||
public static ConnectionCostsWriter build(String filename) throws IOException {
|
|
||||||
FileInputStream inputStream = new FileInputStream(filename);
|
|
||||||
Charset cs = StandardCharsets.US_ASCII;
|
|
||||||
CharsetDecoder decoder = cs.newDecoder()
|
|
||||||
.onMalformedInput(CodingErrorAction.REPORT)
|
|
||||||
.onUnmappableCharacter(CodingErrorAction.REPORT);
|
|
||||||
InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
|
|
||||||
LineNumberReader lineReader = new LineNumberReader(streamReader);
|
|
||||||
|
|
||||||
String line = lineReader.readLine();
|
|
||||||
String[] dimensions = line.split("\\s+");
|
|
||||||
|
|
||||||
assert dimensions.length == 2;
|
|
||||||
|
|
||||||
int forwardSize = Integer.parseInt(dimensions[0]);
|
|
||||||
int backwardSize = Integer.parseInt(dimensions[1]);
|
|
||||||
|
|
||||||
assert forwardSize > 0 && backwardSize > 0;
|
|
||||||
|
|
||||||
ConnectionCostsWriter costs = new ConnectionCostsWriter(forwardSize, backwardSize);
|
|
||||||
|
|
||||||
while ((line = lineReader.readLine()) != null) {
|
|
||||||
String[] fields = line.split("\\s+");
|
|
||||||
|
|
||||||
assert fields.length == 3;
|
|
||||||
|
|
||||||
int forwardId = Integer.parseInt(fields[0]);
|
|
||||||
int backwardId = Integer.parseInt(fields[1]);
|
|
||||||
int cost = Integer.parseInt(fields[2]);
|
|
||||||
|
|
||||||
costs.add(forwardId, backwardId, cost);
|
|
||||||
}
|
|
||||||
return costs;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,67 +0,0 @@
|
||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.analysis.ko.util;
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
public class DictionaryBuilder {
|
|
||||||
|
|
||||||
private DictionaryBuilder() {
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void build(String inputDirname, String outputDirname, String encoding, boolean normalizeEntry) throws IOException {
|
|
||||||
System.out.println("building tokeninfo dict...");
|
|
||||||
TokenInfoDictionaryBuilder tokenInfoBuilder = new TokenInfoDictionaryBuilder(encoding, normalizeEntry);
|
|
||||||
TokenInfoDictionaryWriter tokenInfoDictionary = tokenInfoBuilder.build(inputDirname);
|
|
||||||
tokenInfoDictionary.write(outputDirname);
|
|
||||||
tokenInfoDictionary = null;
|
|
||||||
tokenInfoBuilder = null;
|
|
||||||
System.out.println("done");
|
|
||||||
|
|
||||||
System.out.print("building unknown word dict...");
|
|
||||||
UnknownDictionaryBuilder unkBuilder = new UnknownDictionaryBuilder(encoding);
|
|
||||||
UnknownDictionaryWriter unkDictionary = unkBuilder.build(inputDirname);
|
|
||||||
unkDictionary.write(outputDirname);
|
|
||||||
unkDictionary = null;
|
|
||||||
unkBuilder = null;
|
|
||||||
System.out.println("done");
|
|
||||||
|
|
||||||
System.out.print("building connection costs...");
|
|
||||||
ConnectionCostsWriter connectionCosts
|
|
||||||
= ConnectionCostsBuilder.build(inputDirname + File.separator + "matrix.def");
|
|
||||||
connectionCosts.write(outputDirname);
|
|
||||||
System.out.println("done");
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void main(String[] args) throws IOException {
|
|
||||||
String inputDirname = args[0];
|
|
||||||
String outputDirname = args[1];
|
|
||||||
String inputEncoding = args[2];
|
|
||||||
boolean normalizeEntries = Boolean.parseBoolean(args[3]);
|
|
||||||
|
|
||||||
System.out.println("dictionary builder");
|
|
||||||
System.out.println("");
|
|
||||||
System.out.println("input directory: " + inputDirname);
|
|
||||||
System.out.println("output directory: " + outputDirname);
|
|
||||||
System.out.println("input encoding: " + inputEncoding);
|
|
||||||
System.out.println("normalize entries: " + normalizeEntries);
|
|
||||||
System.out.println("");
|
|
||||||
DictionaryBuilder.build(inputDirname, outputDirname, inputEncoding, normalizeEntries);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,134 +0,0 @@
|
||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.analysis.ko.util;
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.FileInputStream;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.io.LineNumberReader;
|
|
||||||
import java.nio.charset.Charset;
|
|
||||||
import java.nio.charset.CharsetDecoder;
|
|
||||||
import java.nio.charset.CodingErrorAction;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
|
|
||||||
|
|
||||||
public class UnknownDictionaryBuilder {
|
|
||||||
private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,1798,3559,3677,SY,*,*,*,*,*,*,*";
|
|
||||||
|
|
||||||
private String encoding = "utf-8";
|
|
||||||
|
|
||||||
public UnknownDictionaryBuilder(String encoding) {
|
|
||||||
this.encoding = encoding;
|
|
||||||
}
|
|
||||||
|
|
||||||
public UnknownDictionaryWriter build(String dirname) throws IOException {
|
|
||||||
UnknownDictionaryWriter unkDictionary = readDictionaryFile(dirname + File.separator + "unk.def"); //Should be only one file
|
|
||||||
readCharacterDefinition(dirname + File.separator + "char.def", unkDictionary);
|
|
||||||
return unkDictionary;
|
|
||||||
}
|
|
||||||
|
|
||||||
public UnknownDictionaryWriter readDictionaryFile(String filename)
|
|
||||||
throws IOException {
|
|
||||||
return readDictionaryFile(filename, encoding);
|
|
||||||
}
|
|
||||||
|
|
||||||
public UnknownDictionaryWriter readDictionaryFile(String filename, String encoding)
|
|
||||||
throws IOException {
|
|
||||||
UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);
|
|
||||||
|
|
||||||
FileInputStream inputStream = new FileInputStream(filename);
|
|
||||||
Charset cs = Charset.forName(encoding);
|
|
||||||
CharsetDecoder decoder = cs.newDecoder()
|
|
||||||
.onMalformedInput(CodingErrorAction.REPORT)
|
|
||||||
.onUnmappableCharacter(CodingErrorAction.REPORT);
|
|
||||||
InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
|
|
||||||
LineNumberReader lineReader = new LineNumberReader(streamReader);
|
|
||||||
|
|
||||||
dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));
|
|
||||||
|
|
||||||
List<String[]> lines = new ArrayList<>();
|
|
||||||
String line = null;
|
|
||||||
while ((line = lineReader.readLine()) != null) {
|
|
||||||
// note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation,
|
|
||||||
// even though the unknown dictionary returns hardcoded null here.
|
|
||||||
final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
|
|
||||||
lines.add(parsed);
|
|
||||||
}
|
|
||||||
|
|
||||||
Collections.sort(lines, new Comparator<String[]>() {
|
|
||||||
public int compare(String[] left, String[] right) {
|
|
||||||
int leftId = CharacterDefinition.lookupCharacterClass(left[0]);
|
|
||||||
int rightId = CharacterDefinition.lookupCharacterClass(right[0]);
|
|
||||||
return leftId - rightId;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
for (String[] entry : lines) {
|
|
||||||
dictionary.put(entry);
|
|
||||||
}
|
|
||||||
|
|
||||||
return dictionary;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void readCharacterDefinition(String filename, UnknownDictionaryWriter dictionary) throws IOException {
|
|
||||||
FileInputStream inputStream = new FileInputStream(filename);
|
|
||||||
InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
|
|
||||||
LineNumberReader lineReader = new LineNumberReader(streamReader);
|
|
||||||
|
|
||||||
String line = null;
|
|
||||||
|
|
||||||
while ((line = lineReader.readLine()) != null) {
|
|
||||||
line = line.replaceAll("^\\s", "");
|
|
||||||
line = line.replaceAll("\\s*#.*", "");
|
|
||||||
line = line.replaceAll("\\s+", " ");
|
|
||||||
|
|
||||||
// Skip empty line or comment line
|
|
||||||
if(line.length() == 0) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(line.startsWith("0x")) { // Category mapping
|
|
||||||
String[] values = line.split(" ", 2); // Split only first space
|
|
||||||
|
|
||||||
if(!values[0].contains("..")) {
|
|
||||||
int cp = Integer.decode(values[0]).intValue();
|
|
||||||
dictionary.putCharacterCategory(cp, values[1]);
|
|
||||||
} else {
|
|
||||||
String[] codePoints = values[0].split("\\.\\.");
|
|
||||||
int cpFrom = Integer.decode(codePoints[0]).intValue();
|
|
||||||
int cpTo = Integer.decode(codePoints[1]).intValue();
|
|
||||||
|
|
||||||
for(int i = cpFrom; i <= cpTo; i++){
|
|
||||||
dictionary.putCharacterCategory(i, values[1]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else { // Invoke definition
|
|
||||||
String[] values = line.split(" "); // Consecutive space is merged above
|
|
||||||
String characterClassName = values[0];
|
|
||||||
int invoke = Integer.parseInt(values[1]);
|
|
||||||
int group = Integer.parseInt(values[2]);
|
|
||||||
int length = Integer.parseInt(values[3]);
|
|
||||||
dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,80 +0,0 @@
|
||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.analysis.ko.dict;
|
|
||||||
|
|
||||||
import java.io.OutputStream;
|
|
||||||
import java.io.OutputStreamWriter;
|
|
||||||
import java.io.PrintWriter;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.ko.util.TokenInfoDictionaryBuilder;
|
|
||||||
import org.apache.lucene.analysis.ko.util.TokenInfoDictionaryWriter;
|
|
||||||
import org.apache.lucene.util.IntsRef;
|
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
|
||||||
|
|
||||||
import static java.io.File.separatorChar;
|
|
||||||
import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.ResourceScheme;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Tests of TokenInfoDictionary build tools; run using ant test-tools
|
|
||||||
*/
|
|
||||||
public class TokenInfoDictionaryTest extends LuceneTestCase {
|
|
||||||
|
|
||||||
public void testPut() throws Exception {
|
|
||||||
TokenInfoDictionary dict = newDictionary("명사,1,1,2,NNG,*,*,*,*,*,*,*",
|
|
||||||
// "large" id
|
|
||||||
"일반,5000,5000,3,NNG,*,*,*,*,*,*,*");
|
|
||||||
IntsRef wordIdRef = new IntsRefBuilder().get();
|
|
||||||
|
|
||||||
dict.lookupWordIds(0, wordIdRef);
|
|
||||||
int wordId = wordIdRef.ints[wordIdRef.offset];
|
|
||||||
assertEquals(1, dict.getLeftId(wordId));
|
|
||||||
assertEquals(1, dict.getRightId(wordId));
|
|
||||||
assertEquals(2, dict.getWordCost(wordId));
|
|
||||||
|
|
||||||
dict.lookupWordIds(1, wordIdRef);
|
|
||||||
wordId = wordIdRef.ints[wordIdRef.offset];
|
|
||||||
assertEquals(5000, dict.getLeftId(wordId));
|
|
||||||
assertEquals(5000, dict.getRightId(wordId));
|
|
||||||
assertEquals(3, dict.getWordCost(wordId));
|
|
||||||
}
|
|
||||||
|
|
||||||
private TokenInfoDictionary newDictionary(String... entries) throws Exception {
|
|
||||||
Path dir = createTempDir();
|
|
||||||
try (OutputStream out = Files.newOutputStream(dir.resolve("test.csv"));
|
|
||||||
PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, "utf-8"))) {
|
|
||||||
for (String entry : entries) {
|
|
||||||
printer.println(entry);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
TokenInfoDictionaryBuilder builder = new TokenInfoDictionaryBuilder("utf-8", true);
|
|
||||||
TokenInfoDictionaryWriter writer = builder.build(dir.toString());
|
|
||||||
writer.write(dir.toString());
|
|
||||||
String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', separatorChar);
|
|
||||||
// We must also load the other files (in BinaryDictionary) from the correct path
|
|
||||||
return new TokenInfoDictionary(ResourceScheme.FILE, dir.resolve(dictionaryPath).toString());
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testPutException() throws Exception {
|
|
||||||
// too few columns
|
|
||||||
expectThrows(IllegalArgumentException.class, () -> newDictionary("HANGUL,1,1,1,NNG,*,*,*,*,*"));
|
|
||||||
// id too large
|
|
||||||
expectThrows(IllegalArgumentException.class, () -> newDictionary("HANGUL,8192,8192,1,NNG,*,*,*,*,*,*,*"));
|
|
||||||
}
|
|
||||||
}
|
|
Loading…
Reference in New Issue