LUCENE-8871: promote kuromoji tools to main jar

Michael Sokolov 2019-06-22 11:13:02 -04:00
parent a76c962ee6
commit 024e200bb9
18 changed files with 424 additions and 606 deletions

View File

@@ -26,7 +26,6 @@
<!-- currently whether rat detects this as binary or not
is platform dependent?! -->
<property name="rat.excludes" value="**/*.txt,**/bocchan.utf-8"/>
<property name="rat.additional-includes" value="src/tools/**"/>
<!-- we don't want to pull in ipadic/naist etc -->
<property name="ivy.default.configuration" value="default"/>
@@ -52,6 +51,9 @@
<available type="dir" file="${build.dir}/${ipadic.version}" property="dict.available"/>
<path id="classpath">
<dirset dir="${build.dir}">
<include name="classes/java"/>
</dirset>
<pathelement path="${analyzers-common.jar}"/>
<path refid="base.classpath"/>
</path>
@@ -69,28 +71,14 @@
originalfile="${dict.src.dir}/Noun.proper.csv"/>
</target>
<path id="tools.classpath">
<path refid="classpath"/>
<pathelement location="${build.dir}/classes/java"/>
<pathelement location="${build.dir}/classes/tools"/>
</path>
<path id="tools.test.classpath">
<path refid="tools.classpath"/>
<path refid="test.base.classpath"/>
<pathelement location="${build.dir}/classes/tools-test"/>
</path>
<target name="build-dict" depends="compile-tools, patch-dict">
<target name="build-dict" depends="compile, patch-dict">
<sequential>
<delete verbose="true">
<fileset dir="${resources.dir}/org/apache/lucene/analysis/ja/dict" includes="**/*"/>
</delete>
<!-- TODO: optimize the dictionary construction a bit so that you don't need 1G -->
<java fork="true" failonerror="true" maxmemory="1g" classname="org.apache.lucene.analysis.ja.util.DictionaryBuilder">
<classpath>
<path refid="tools.classpath"/>
</classpath>
<classpath refid="classpath"/>
<assertions>
<enable package="org.apache.lucene"/>
</assertions>
@@ -103,34 +91,7 @@
</sequential>
</target>
<target name="compile-tools" depends="compile-core, common.compile-tools">
<compile
srcdir="src/tools/java"
destdir="${build.dir}/classes/tools">
<classpath>
<path refid="tools.classpath"/>
</classpath>
</compile>
</target>
<target name="compile-tools-tests" depends="compile-tools">
<compile
srcdir="src/tools/test"
destdir="${build.dir}/classes/tools-test">
<classpath>
<path refid="tools.test.classpath"/>
<pathelement path="src/tools/test"/>
</classpath>
</compile>
</target>
<target name="test-tools" depends="install-junit4-taskdef, compile-tools-tests">
<test-macro testsDir="${build.dir}/classes/tools-test" workDir="src/tools/test" junit.classpath="tools.test.classpath"/>
</target>
<target name="compile-test" depends="module-build.compile-test, compile-tools-tests"/>
<!-- TODO: not until we properly make 'test-tools' work with clover etc
<target name="test" depends="module-build.test, test-tools"/> -->
<target name="compile-test" depends="module-build.compile-test"/>
<target name="regenerate" depends="build-dict"/>

View File

@@ -22,7 +22,6 @@ import java.io.InputStream;
import java.io.IOException;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
@@ -43,20 +42,9 @@ public final class TokenInfoDictionary extends BinaryDictionary {
*/
TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
super(resourceScheme, resourcePath);
InputStream is = null;
FST<Long> fst;
boolean success = false;
try {
is = getResource(FST_FILENAME_SUFFIX);
is = new BufferedInputStream(is);
try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {
fst = new FST<>(new InputStreamDataInput(is), PositiveIntOutputs.getSingleton());
success = true;
} finally {
if (success) {
IOUtils.close(is);
} else {
IOUtils.closeWhileHandlingException(is);
}
}
// TODO: some way to configure?
this.fst = new TokenInfoFST(fst, true);

View File

@@ -18,13 +18,13 @@ package org.apache.lucene.analysis.ja.util;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import org.apache.lucene.codecs.CodecUtil;
@@ -37,14 +37,14 @@ import org.apache.lucene.analysis.ja.dict.BinaryDictionary;
public abstract class BinaryDictionaryWriter {
private final static int ID_LIMIT = 8192;
protected final Class<? extends BinaryDictionary> implClazz;
private final Class<? extends BinaryDictionary> implClazz;
protected ByteBuffer buffer;
private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1;
private int[] targetMap = new int[8192];
private int[] targetMapOffsets = new int[8192];
private final ArrayList<String> posDict = new ArrayList<>();
public BinaryDictionaryWriter(Class<? extends BinaryDictionary> implClazz, int size) {
BinaryDictionaryWriter(Class<? extends BinaryDictionary> implClazz, int size) {
this.implClazz = implClazz;
buffer = ByteBuffer.allocate(size);
}
@@ -199,7 +199,7 @@
}
private String toKatakana(String s) {
char text[] = new char[s.length()];
char[] text = new char[s.length()];
for (int i = 0; i < s.length(); i++) {
char ch = s.charAt(i);
if (ch > 0x3040 && ch < 0x3097) {
@@ -211,7 +211,7 @@
return new String(text);
}
public static int sharedPrefix(String left, String right) {
private static int sharedPrefix(String left, String right) {
int len = left.length() < right.length() ? left.length() : right.length();
for (int i = 0; i < len; i++)
if (left.charAt(i) != right.charAt(i))
@@ -219,7 +219,7 @@
return len;
}
public void addMapping(int sourceId, int wordId) {
void addMapping(int sourceId, int wordId) {
if (wordId <= lastWordId) {
throw new IllegalStateException("words out of order: " + wordId + " vs lastID: " + lastWordId);
}
@@ -241,8 +241,8 @@
lastWordId = wordId;
}
protected final String getBaseFileName(String baseDir) {
return baseDir + File.separator + implClazz.getName().replace('.', File.separatorChar);
final String getBaseFileName() {
return implClazz.getName().replace('.', '/');
}
/**
@@ -251,20 +251,19 @@
* [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
* @throws IOException if an I/O error occurs writing the dictionary files
*/
public void write(String baseDir) throws IOException {
final String baseName = getBaseFileName(baseDir);
writeDictionary(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX);
writeTargetMap(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX);
writePosDict(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX);
public void write(Path baseDir) throws IOException {
final String baseName = getBaseFileName();
writeDictionary(baseDir.resolve(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX));
writeTargetMap(baseDir.resolve(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX));
writePosDict(baseDir.resolve(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX));
}
// TODO: maybe this int[] should instead be the output to the FST...
protected void writeTargetMap(String filename) throws IOException {
new File(filename).getParentFile().mkdirs();
OutputStream os = new FileOutputStream(filename);
try {
os = new BufferedOutputStream(os);
final DataOutput out = new OutputStreamDataOutput(os);
private void writeTargetMap(Path path) throws IOException {
Files.createDirectories(path.getParent());
try (OutputStream os = Files.newOutputStream(path);
OutputStream bos = new BufferedOutputStream(os)) {
final DataOutput out = new OutputStreamDataOutput(bos);
CodecUtil.writeHeader(out, BinaryDictionary.TARGETMAP_HEADER, BinaryDictionary.VERSION);
final int numSourceIds = lastSourceId + 1;
@@ -285,17 +284,14 @@
if (sourceId != numSourceIds) {
throw new IllegalStateException("sourceId:" + sourceId + " != numSourceIds:" + numSourceIds);
}
} finally {
os.close();
}
}
protected void writePosDict(String filename) throws IOException {
new File(filename).getParentFile().mkdirs();
OutputStream os = new FileOutputStream(filename);
try {
os = new BufferedOutputStream(os);
final DataOutput out = new OutputStreamDataOutput(os);
private void writePosDict(Path path) throws IOException {
Files.createDirectories(path.getParent());
try (OutputStream os = Files.newOutputStream(path);
OutputStream bos = new BufferedOutputStream(os)) {
final DataOutput out = new OutputStreamDataOutput(bos);
CodecUtil.writeHeader(out, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION);
out.writeVInt(posDict.size());
for (String s : posDict) {
@@ -304,7 +300,7 @@
out.writeByte((byte)0);
out.writeByte((byte)0);
} else {
String data[] = CSVUtil.parse(s);
String[] data = CSVUtil.parse(s);
if (data.length != 3) {
throw new IllegalArgumentException("Malformed pos/inflection: " + s + "; expected 3 characters");
}
@@ -313,25 +309,21 @@
out.writeString(data[2]);
}
}
} finally {
os.close();
}
}
protected void writeDictionary(String filename) throws IOException {
new File(filename).getParentFile().mkdirs();
final FileOutputStream os = new FileOutputStream(filename);
try {
final DataOutput out = new OutputStreamDataOutput(os);
private void writeDictionary(Path path) throws IOException {
Files.createDirectories(path.getParent());
try (OutputStream os = Files.newOutputStream(path);
OutputStream bos = new BufferedOutputStream(os)) {
final DataOutput out = new OutputStreamDataOutput(bos);
CodecUtil.writeHeader(out, BinaryDictionary.DICT_HEADER, BinaryDictionary.VERSION);
out.writeVInt(buffer.position());
final WritableByteChannel channel = Channels.newChannel(os);
final WritableByteChannel channel = Channels.newChannel(bos);
// Write Buffer
buffer.flip(); // set position to 0, set limit to current position
channel.write(buffer);
assert buffer.remaining() == 0L;
} finally {
os.close();
}
}
}
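Since write(Path) resolves the class-derived base name against baseDir, a TokenInfoDictionary build lands its three artifacts under the output root along these lines. The exact suffix constants live in BinaryDictionary; the .dat names below are an assumption for illustration:

<baseDir>/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$buffer.dat     <- writeDictionary
<baseDir>/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$targetMap.dat  <- writeTargetMap
<baseDir>/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$posDict.dat    <- writePosDict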

View File

@@ -18,10 +18,10 @@ package org.apache.lucene.analysis.ja.util;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import org.apache.lucene.analysis.ja.dict.CharacterDefinition;
@@ -40,7 +40,7 @@ public final class CharacterDefinitionWriter {
/**
* Constructor for building. TODO: remove write access
*/
public CharacterDefinitionWriter() {
CharacterDefinitionWriter() {
Arrays.fill(characterCategoryMap, CharacterDefinition.DEFAULT);
}
@@ -51,7 +51,7 @@
* code point
* @param characterClassName character class name
*/
public void putCharacterCategory(int codePoint, String characterClassName) {
void putCharacterCategory(int codePoint, String characterClassName) {
characterClassName = characterClassName.split(" ")[0]; // use first
// category
// class
@@ -63,20 +63,17 @@
characterCategoryMap[codePoint] = CharacterDefinition.lookupCharacterClass(characterClassName);
}
public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
final byte characterClass = CharacterDefinition.lookupCharacterClass(characterClassName);
invokeMap[characterClass] = invoke == 1;
groupMap[characterClass] = group == 1;
// TODO: length def ignored
}
public void write(String baseDir) throws IOException {
String filename = baseDir + File.separator +
CharacterDefinition.class.getName().replace('.', File.separatorChar) + CharacterDefinition.FILENAME_SUFFIX;
new File(filename).getParentFile().mkdirs();
OutputStream os = new FileOutputStream(filename);
try {
os = new BufferedOutputStream(os);
public void write(Path baseDir) throws IOException {
Path path = baseDir.resolve(CharacterDefinition.class.getName().replace('.', '/') + CharacterDefinition.FILENAME_SUFFIX);
Files.createDirectories(path.getParent());
try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(path))){
final DataOutput out = new OutputStreamDataOutput(os);
CodecUtil.writeHeader(out, CharacterDefinition.HEADER, CharacterDefinition.VERSION);
out.writeBytes(characterCategoryMap, 0, characterCategoryMap.length);
@@ -87,8 +84,6 @@
);
out.writeByte(b);
}
} finally {
os.close();
}
}

View File

@@ -0,0 +1,62 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.util;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
public class ConnectionCostsBuilder {
private ConnectionCostsBuilder() {
}
public static ConnectionCostsWriter build(Path path) throws IOException {
try (Reader reader = Files.newBufferedReader(path, StandardCharsets.US_ASCII);
LineNumberReader lineReader = new LineNumberReader(reader)) {
String line = lineReader.readLine();
String[] dimensions = line.split("\\s+");
assert dimensions.length == 2;
int forwardSize = Integer.parseInt(dimensions[0]);
int backwardSize = Integer.parseInt(dimensions[1]);
assert forwardSize > 0 && backwardSize > 0;
ConnectionCostsWriter costs = new ConnectionCostsWriter(forwardSize, backwardSize);
while ((line = lineReader.readLine()) != null) {
String[] fields = line.split("\\s+");
assert fields.length == 3;
int forwardId = Integer.parseInt(fields[0]);
int backwardId = Integer.parseInt(fields[1]);
int cost = Integer.parseInt(fields[2]);
costs.add(forwardId, backwardId, cost);
}
return costs;
}
}
}
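For reference, build(Path) parses mecab's matrix.def layout: a header line holding the forward and backward dimensions, followed by one "forwardId backwardId cost" triple per line. A tiny hand-written input (cost values invented):

2 2
0 0 0
0 1 129
1 0 -50
1 1 312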

View File

@@ -18,10 +18,10 @@ package org.apache.lucene.analysis.ja.util;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
@@ -37,7 +37,7 @@ public final class ConnectionCostsWriter {
/**
* Constructor for building. TODO: remove write access
*/
public ConnectionCostsWriter(int forwardSize, int backwardSize) {
ConnectionCostsWriter(int forwardSize, int backwardSize) {
this.forwardSize = forwardSize;
this.backwardSize = backwardSize;
this.costs = new short[backwardSize][forwardSize];
@@ -47,14 +47,12 @@
this.costs[backwardId][forwardId] = (short)cost;
}
public void write(String baseDir) throws IOException {
String filename = baseDir + File.separator +
ConnectionCosts.class.getName().replace('.', File.separatorChar) + ConnectionCosts.FILENAME_SUFFIX;
new File(filename).getParentFile().mkdirs();
OutputStream os = new FileOutputStream(filename);
try {
os = new BufferedOutputStream(os);
final DataOutput out = new OutputStreamDataOutput(os);
public void write(Path baseDir) throws IOException {
Files.createDirectories(baseDir);
String fileName = ConnectionCosts.class.getName().replace('.', '/') + ConnectionCosts.FILENAME_SUFFIX;
try (OutputStream os = Files.newOutputStream(baseDir.resolve(fileName));
OutputStream bos = new BufferedOutputStream(os)) {
final DataOutput out = new OutputStreamDataOutput(bos);
CodecUtil.writeHeader(out, ConnectionCosts.HEADER, ConnectionCosts.VERSION);
out.writeVInt(forwardSize);
out.writeVInt(backwardSize);
@@ -62,14 +60,12 @@
assert costs.length == backwardSize;
for (short[] a : costs) {
assert a.length == forwardSize;
for (int i = 0; i < a.length; i++) {
int delta = (int)a[i] - last;
for (short cost : a) {
int delta = (int) cost - last;
out.writeZInt(delta);
last = a[i];
last = cost;
}
}
} finally {
os.close();
}
}
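The matrix is serialized as deltas against the previous cost, zigzag-encoded by writeZInt, with last carried across row boundaries. A worked micro-example: a single row of costs [3, 5, 2] is written as the ZInt encodings of 3 (3 - 0), 2 (5 - 3), and -3 (2 - 5); a reader recovers the row by running the sum forward again.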

View File

@@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.util;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
public class DictionaryBuilder {
public enum DictionaryFormat { IPADIC, UNIDIC }
private DictionaryBuilder() {
}
public static void build(DictionaryFormat format, Path inputDir, Path outputDir, String encoding, boolean normalizeEntry) throws IOException {
new TokenInfoDictionaryBuilder(format, encoding, normalizeEntry)
.build(inputDir)
.write(outputDir);
new UnknownDictionaryBuilder(encoding)
.build(inputDir)
.write(outputDir);
ConnectionCostsBuilder.build(inputDir.resolve("matrix.def"))
.write(outputDir);
}
public static void main(String[] args) throws IOException {
DictionaryFormat format = DictionaryFormat.valueOf(args[0].toUpperCase());
String inputDirName = args[1];
String outputDirName = args[2];
String inputEncoding = args[3];
boolean normalizeEntries = Boolean.parseBoolean(args[4]);
DictionaryBuilder.build(format, Paths.get(inputDirName), Paths.get(outputDirName), inputEncoding, normalizeEntries);
}
}
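A minimal sketch of driving the builder from code rather than the command line; the five arguments mirror main's positional args, and the paths below are placeholders:

import java.nio.file.Paths;
import org.apache.lucene.analysis.ja.util.DictionaryBuilder;
import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;

public class RebuildIpadic {
  public static void main(String[] args) throws Exception {
    // equivalent to: DictionaryBuilder IPADIC /tmp/mecab-ipadic /tmp/dict-out euc-jp false
    DictionaryBuilder.build(
        DictionaryFormat.IPADIC,
        Paths.get("/tmp/mecab-ipadic"), // placeholder: unpacked source dictionary
        Paths.get("/tmp/dict-out"),     // placeholder: output resource root
        "euc-jp",                       // ipadic's native encoding
        false);                         // skip NFKC normalization of entries
  }
}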

View File

@@ -16,22 +16,18 @@
*/
package org.apache.lucene.analysis.ja.util;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
import org.apache.lucene.util.IntsRefBuilder;
@@ -42,89 +38,63 @@ import org.apache.lucene.util.fst.PositiveIntOutputs;
/**
*/
public class TokenInfoDictionaryBuilder {
private final String encoding;
private final Normalizer.Form normalForm;
private final DictionaryFormat format;
/** Internal word id - incrementally assigned as entries are read and added. This will be byte offset of dictionary file */
private int offset = 0;
private String encoding = "euc-jp";
private Normalizer.Form normalForm;
private DictionaryFormat format = DictionaryFormat.IPADIC;
public TokenInfoDictionaryBuilder(DictionaryFormat format, String encoding, boolean normalizeEntries) {
this.format = format;
this.encoding = encoding;
this.normalForm = normalizeEntries ? Normalizer.Form.NFKC : null;
normalForm = normalizeEntries ? Normalizer.Form.NFKC : null;
}
public TokenInfoDictionaryWriter build(String dirname) throws IOException {
FilenameFilter filter = new FilenameFilter() {
@Override
public boolean accept(File dir, String name) {
return name.endsWith(".csv");
}
};
ArrayList<File> csvFiles = new ArrayList<>();
for (File file : new File(dirname).listFiles(filter)) {
csvFiles.add(file);
public TokenInfoDictionaryWriter build(Path dir) throws IOException {
try (Stream<Path> files = Files.list(dir)) {
List<Path> csvFiles = files
.filter(path -> path.getFileName().toString().endsWith(".csv"))
.sorted()
.collect(Collectors.toList());
return buildDictionary(csvFiles);
}
Collections.sort(csvFiles);
return buildDictionary(csvFiles);
}
public TokenInfoDictionaryWriter buildDictionary(List<File> csvFiles) throws IOException {
private TokenInfoDictionaryWriter buildDictionary(List<Path> csvFiles) throws IOException {
TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);
Charset cs = Charset.forName(encoding);
// all lines in the file
System.out.println(" parse...");
List<String[]> lines = new ArrayList<>(400000);
for (File file : csvFiles){
FileInputStream inputStream = new FileInputStream(file);
Charset cs = Charset.forName(encoding);
CharsetDecoder decoder = cs.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
BufferedReader reader = new BufferedReader(streamReader);
String line = null;
while ((line = reader.readLine()) != null) {
String[] entry = CSVUtil.parse(line);
for (Path path : csvFiles) {
try (BufferedReader reader = Files.newBufferedReader(path, cs)) {
String line;
while ((line = reader.readLine()) != null) {
String[] entry = CSVUtil.parse(line);
if(entry.length < 13) {
throw new IllegalArgumentException("Entry in CSV is not valid (13 field values expected): " + line);
}
String[] formatted = formatEntry(entry);
lines.add(formatted);
// NFKC normalize dictionary entry
if (normalForm != null) {
if (Normalizer.isNormalized(entry[0], normalForm)){
continue;
if (entry.length < 13) {
throw new IllegalArgumentException("Entry in CSV is not valid (13 field values expected): " + line);
}
String[] normalizedEntry = new String[entry.length];
for (int i = 0; i < entry.length; i++) {
normalizedEntry[i] = Normalizer.normalize(entry[i], normalForm);
lines.add(formatEntry(entry));
if (normalForm != null) {
if (Normalizer.isNormalized(entry[0], normalForm)) {
continue;
}
String[] normalizedEntry = new String[entry.length];
for (int i = 0; i < entry.length; i++) {
normalizedEntry[i] = Normalizer.normalize(entry[i], normalForm);
}
lines.add(formatEntry(normalizedEntry));
}
formatted = formatEntry(normalizedEntry);
lines.add(formatted);
}
}
}
System.out.println(" sort...");
// sort by term: we sorted the files already and use a stable sort.
Collections.sort(lines, new Comparator<String[]>() {
public int compare(String[] left, String[] right) {
return left[0].compareTo(right[0]);
}
});
System.out.println(" encode...");
lines.sort(Comparator.comparing(entry -> entry[0]));
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15, false);
@@ -132,13 +102,12 @@ public class TokenInfoDictionaryBuilder {
long ord = -1; // first ord will be 0
String lastValue = null;
// build tokeninfo dictionary
// build token info dictionary
for (String[] entry : lines) {
int next = dictionary.put(entry);
if(next == offset){
System.out.println("Failed to process line: " + Arrays.toString(entry));
continue;
throw new IllegalStateException("Failed to process line: " + Arrays.toString(entry));
}
String token = entry[0];
@@ -153,16 +122,10 @@
}
fstBuilder.add(scratch.get(), ord);
}
dictionary.addMapping((int)ord, offset);
dictionary.addMapping((int) ord, offset);
offset = next;
}
final FST<Long> fst = fstBuilder.finish();
System.out.print(" " + fstBuilder.getNodeCount() + " nodes, " + fstBuilder.getArcCount() + " arcs, " + fst.ramBytesUsed() + " bytes... ");
dictionary.setFST(fst);
System.out.println(" done");
dictionary.setFST(fstBuilder.finish());
return dictionary;
}
@@ -191,7 +154,7 @@
* 13 - surface reading
*/
public String[] formatEntry(String[] features) {
private String[] formatEntry(String[] features) {
if (this.format == DictionaryFormat.IPADIC) {
return features;
} else {
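The 13-field requirement above matches ipadic's CSV shape: surface form, left id, right id, word cost, then part-of-speech and inflection columns. The test added later in this commit builds dictionaries from entries of exactly that form, for example:

名詞,1,1,2,名詞,一般,*,*,*,*,*,*,*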

View File

@@ -20,7 +20,7 @@ package org.apache.lucene.analysis.ja.util;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Objects;
import org.apache.lucene.analysis.ja.dict.TokenInfoDictionary;
import org.apache.lucene.util.fst.FST;
@@ -28,23 +28,23 @@ import org.apache.lucene.util.fst.FST;
public class TokenInfoDictionaryWriter extends BinaryDictionaryWriter {
private FST<Long> fst;
public TokenInfoDictionaryWriter(int size) {
TokenInfoDictionaryWriter(int size) {
super(TokenInfoDictionary.class, size);
}
public void setFST(FST<Long> fst) {
Objects.requireNonNull(fst, "dictionary must not be empty");
this.fst = fst;
}
@Override
public void write(String baseDir) throws IOException {
public void write(Path baseDir) throws IOException {
super.write(baseDir);
writeFST(getBaseFileName(baseDir) + TokenInfoDictionary.FST_FILENAME_SUFFIX);
writeFST(baseDir.resolve(getBaseFileName() + TokenInfoDictionary.FST_FILENAME_SUFFIX));
}
protected void writeFST(String filename) throws IOException {
Path p = Paths.get(filename);
Files.createDirectories(p.getParent());
fst.save(p);
private void writeFST(Path path) throws IOException {
Files.createDirectories(path.getParent());
fst.save(path);
}
}

View File

@@ -0,0 +1,119 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.util;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.ja.dict.CharacterDefinition;
public class UnknownDictionaryBuilder {
private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*";
private final String encoding;
UnknownDictionaryBuilder(String encoding) {
this.encoding = encoding;
}
public UnknownDictionaryWriter build(Path dir) throws IOException {
UnknownDictionaryWriter unkDictionary = readDictionaryFile(dir.resolve("unk.def")); //Should be only one file
readCharacterDefinition(dir.resolve("char.def"), unkDictionary);
return unkDictionary;
}
private UnknownDictionaryWriter readDictionaryFile(Path path) throws IOException {
return readDictionaryFile(path, encoding);
}
private UnknownDictionaryWriter readDictionaryFile(Path path, String encoding) throws IOException {
UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);
List<String[]> lines = new ArrayList<>();
try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding));
LineNumberReader lineReader = new LineNumberReader(reader)) {
dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));
String line;
while ((line = lineReader.readLine()) != null) {
// note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation,
// even though the unknown dictionary returns hardcoded null here.
final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
lines.add(parsed);
}
}
lines.sort(Comparator.comparingInt(entry -> CharacterDefinition.lookupCharacterClass(entry[0])));
for (String[] entry : lines) {
dictionary.put(entry);
}
return dictionary;
}
private void readCharacterDefinition(Path path, UnknownDictionaryWriter dictionary) throws IOException {
try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding));
LineNumberReader lineReader = new LineNumberReader(reader)) {
String line;
while ((line = lineReader.readLine()) != null) {
line = line.replaceAll("^\\s", "");
line = line.replaceAll("\\s*#.*", "");
line = line.replaceAll("\\s+", " ");
// Skip empty line or comment line
if (line.length() == 0) {
continue;
}
if (line.startsWith("0x")) { // Category mapping
String[] values = line.split(" ", 2); // Split only first space
if (!values[0].contains("..")) {
int cp = Integer.decode(values[0]);
dictionary.putCharacterCategory(cp, values[1]);
} else {
String[] codePoints = values[0].split("\\.\\.");
int cpFrom = Integer.decode(codePoints[0]);
int cpTo = Integer.decode(codePoints[1]);
for (int i = cpFrom; i <= cpTo; i++) {
dictionary.putCharacterCategory(i, values[1]);
}
}
} else { // Invoke definition
String[] values = line.split(" "); // Consecutive space is merged above
String characterClassName = values[0];
int invoke = Integer.parseInt(values[1]);
int group = Integer.parseInt(values[2]);
int length = Integer.parseInt(values[3]);
dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
}
}
}
}
}
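readCharacterDefinition handles the two line shapes found in mecab's char.def: category mappings starting with 0x (a single code point or a ".." range) and invoke definitions. Illustrative lines of each kind (values invented; inline comments are stripped by the regexes above):

0x3007 KANJI          # single code point mapped to a category
0x4E00..0x9FFF KANJI  # code point range mapped to a category
KANJI 0 0 2           # invoke definition: class, invoke, group, length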

View File

@@ -18,6 +18,8 @@ package org.apache.lucene.analysis.ja.util;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.analysis.ja.dict.CharacterDefinition;
import org.apache.lucene.analysis.ja.dict.UnknownDictionary;
@@ -58,7 +60,7 @@ public class UnknownDictionaryWriter extends BinaryDictionaryWriter {
}
@Override
public void write(String baseDir) throws IOException {
public void write(Path baseDir) throws IOException {
super.write(baseDir);
characterDefinition.write(baseDir);
}

View File

@@ -16,16 +16,76 @@
*/
package org.apache.lucene.analysis.ja.dict;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
import org.apache.lucene.analysis.ja.util.ToStringUtil;
import org.apache.lucene.analysis.ja.util.TokenInfoDictionaryBuilder;
import org.apache.lucene.analysis.ja.util.TokenInfoDictionaryWriter;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.IntsRefFSTEnum;
import org.apache.lucene.util.fst.IntsRefFSTEnum.InputOutput;
public class TestTokenInfoDictionary extends LuceneTestCase {
import static org.apache.lucene.analysis.ja.dict.BinaryDictionary.ResourceScheme;
/**
* Tests of TokenInfoDictionary build tools; run using ant test-tools
*/
public class TokenInfoDictionaryTest extends LuceneTestCase {
public void testPut() throws Exception {
TokenInfoDictionary dict = newDictionary("名詞,1,1,2,名詞,一般,*,*,*,*,*,*,*",
// "large" id
"一般,5000,5000,3,名詞,一般,*,*,*,*,*,*,*");
IntsRef wordIdRef = new IntsRefBuilder().get();
dict.lookupWordIds(0, wordIdRef);
int wordId = wordIdRef.ints[wordIdRef.offset];
assertEquals(5000, dict.getLeftId(wordId));
assertEquals(5000, dict.getRightId(wordId));
assertEquals(3, dict.getWordCost(wordId));
dict.lookupWordIds(1, wordIdRef);
wordId = wordIdRef.ints[wordIdRef.offset];
assertEquals(1, dict.getLeftId(wordId));
assertEquals(1, dict.getRightId(wordId));
assertEquals(2, dict.getWordCost(wordId));
}
private TokenInfoDictionary newDictionary(String... entries) throws Exception {
Path dir = createTempDir();
try (OutputStream out = Files.newOutputStream(dir.resolve("test.csv"));
PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8))) {
for (String entry : entries) {
printer.println(entry);
}
}
TokenInfoDictionaryBuilder builder = new TokenInfoDictionaryBuilder(DictionaryFormat.IPADIC, "utf-8", true);
TokenInfoDictionaryWriter writer = builder.build(dir);
writer.write(dir);
String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', '/');
// We must also load the other files (in BinaryDictionary) from the correct path
return new TokenInfoDictionary(ResourceScheme.FILE, dir.resolve(dictionaryPath).toString());
}
public void testPutException() {
// too few columns
expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1,1,1,名詞,一般,*,*,*,*,*"));
// left id != right id
expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1285,1,1,名詞,一般,*,*,*,*,*,*,*"));
// left id != right id
expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1285,1,1,名詞,一般,*,*,*,*,*,*,*"));
// id too large
expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,8192,8192,1,名詞,一般,*,*,*,*,*,*,*"));
}
/** enumerates the entire FST/lookup data and just does basic sanity checks */
public void testEnumerateAll() throws Exception {
@@ -38,17 +98,17 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
ConnectionCosts matrix = ConnectionCosts.getInstance();
FST<Long> fst = tid.getFST().getInternalFST();
IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst);
InputOutput<Long> mapping;
IntsRefFSTEnum.InputOutput<Long> mapping;
IntsRef scratch = new IntsRef();
while ((mapping = fstEnum.next()) != null) {
numTerms++;
IntsRef input = mapping.input;
char chars[] = new char[input.length];
char[] chars = new char[input.length];
for (int i = 0; i < chars.length; i++) {
chars[i] = (char)input.ints[input.offset+i];
}
assertTrue(UnicodeUtil.validUTF16String(new String(chars)));
Long output = mapping.output;
int sourceId = output.intValue();
// we walk in order, terms, sourceIds, and wordIds should always be increasing
@@ -60,41 +120,41 @@
int wordId = scratch.ints[scratch.offset+i];
assertTrue(wordId > lastWordId);
lastWordId = wordId;
String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length);
assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm));
String inflectionForm = tid.getInflectionForm(wordId);
assertTrue(inflectionForm == null || UnicodeUtil.validUTF16String(inflectionForm));
if (inflectionForm != null) {
// check that it's actually an ipadic inflection form
assertNotNull(ToStringUtil.getInflectedFormTranslation(inflectionForm));
assertNotNull(ToStringUtil.getInflectedFormTranslation(inflectionForm));
}
String inflectionType = tid.getInflectionType(wordId);
assertTrue(inflectionType == null || UnicodeUtil.validUTF16String(inflectionType));
if (inflectionType != null) {
// check that it's actually an ipadic inflection type
assertNotNull(ToStringUtil.getInflectionTypeTranslation(inflectionType));
}
int leftId = tid.getLeftId(wordId);
int rightId = tid.getRightId(wordId);
matrix.get(rightId, leftId);
tid.getWordCost(wordId);
String pos = tid.getPartOfSpeech(wordId);
assertNotNull(pos);
assertTrue(UnicodeUtil.validUTF16String(pos));
// check that it's actually an ipadic pos tag
assertNotNull(ToStringUtil.getPOSTranslation(pos));
String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length);
assertNotNull(pronunciation);
assertTrue(UnicodeUtil.validUTF16String(pronunciation));
String reading = tid.getReading(wordId, chars, 0, chars.length);
assertNotNull(reading);
assertTrue(UnicodeUtil.validUTF16String(reading));
@@ -104,4 +164,5 @@
System.out.println("checked " + numTerms + " terms, " + numWords + " words.");
}
}
}

View File

@@ -1,68 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.util;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
public class ConnectionCostsBuilder {
private ConnectionCostsBuilder() {
}
public static ConnectionCostsWriter build(String filename) throws IOException {
FileInputStream inputStream = new FileInputStream(filename);
Charset cs = StandardCharsets.US_ASCII;
CharsetDecoder decoder = cs.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
LineNumberReader lineReader = new LineNumberReader(streamReader);
String line = lineReader.readLine();
String[] dimensions = line.split("\\s+");
assert dimensions.length == 2;
int forwardSize = Integer.parseInt(dimensions[0]);
int backwardSize = Integer.parseInt(dimensions[1]);
assert forwardSize > 0 && backwardSize > 0;
ConnectionCostsWriter costs = new ConnectionCostsWriter(forwardSize, backwardSize);
while ((line = lineReader.readLine()) != null) {
String[] fields = line.split("\\s+");
assert fields.length == 3;
int forwardId = Integer.parseInt(fields[0]);
int backwardId = Integer.parseInt(fields[1]);
int cost = Integer.parseInt(fields[2]);
costs.add(forwardId, backwardId, cost);
}
return costs;
}
}

View File

@@ -1,85 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.util;
import java.io.File;
import java.io.IOException;
public class DictionaryBuilder {
public enum DictionaryFormat { IPADIC, UNIDIC };
private DictionaryBuilder() {
}
public static void build(DictionaryFormat format,
String inputDirname,
String outputDirname,
String encoding,
boolean normalizeEntry) throws IOException {
System.out.println("building tokeninfo dict...");
TokenInfoDictionaryBuilder tokenInfoBuilder = new TokenInfoDictionaryBuilder(format, encoding, normalizeEntry);
TokenInfoDictionaryWriter tokenInfoDictionary = tokenInfoBuilder.build(inputDirname);
tokenInfoDictionary.write(outputDirname);
tokenInfoDictionary = null;
tokenInfoBuilder = null;
System.out.println("done");
System.out.print("building unknown word dict...");
UnknownDictionaryBuilder unkBuilder = new UnknownDictionaryBuilder(encoding);
UnknownDictionaryWriter unkDictionary = unkBuilder.build(inputDirname);
unkDictionary.write(outputDirname);
unkDictionary = null;
unkBuilder = null;
System.out.println("done");
System.out.print("building connection costs...");
ConnectionCostsWriter connectionCosts
= ConnectionCostsBuilder.build(inputDirname + File.separator + "matrix.def");
connectionCosts.write(outputDirname);
System.out.println("done");
}
public static void main(String[] args) throws IOException {
DictionaryFormat format;
if (args[0].equalsIgnoreCase("ipadic")) {
format = DictionaryFormat.IPADIC;
} else if (args[0].equalsIgnoreCase("unidic")) {
format = DictionaryFormat.UNIDIC;
} else {
System.err.println("Illegal format " + args[0] + " using unidic instead");
format = DictionaryFormat.IPADIC;
}
String inputDirname = args[1];
String outputDirname = args[2];
String inputEncoding = args[3];
boolean normalizeEntries = Boolean.parseBoolean(args[4]);
System.out.println("dictionary builder");
System.out.println("");
System.out.println("dictionary format: " + format);
System.out.println("input directory: " + inputDirname);
System.out.println("output directory: " + outputDirname);
System.out.println("input encoding: " + inputEncoding);
System.out.println("normalize entries: " + normalizeEntries);
System.out.println("");
DictionaryBuilder.build(format, inputDirname, outputDirname, inputEncoding, normalizeEntries);
}
}

View File

@@ -1,135 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.util;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.ja.dict.CharacterDefinition;
public class UnknownDictionaryBuilder {
private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*";
private String encoding = "euc-jp";
public UnknownDictionaryBuilder(String encoding) {
this.encoding = encoding;
}
public UnknownDictionaryWriter build(String dirname) throws IOException {
UnknownDictionaryWriter unkDictionary = readDictionaryFile(dirname + File.separator + "unk.def"); //Should be only one file
readCharacterDefinition(dirname + File.separator + "char.def", unkDictionary);
return unkDictionary;
}
public UnknownDictionaryWriter readDictionaryFile(String filename)
throws IOException {
return readDictionaryFile(filename, encoding);
}
public UnknownDictionaryWriter readDictionaryFile(String filename, String encoding)
throws IOException {
UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);
FileInputStream inputStream = new FileInputStream(filename);
Charset cs = Charset.forName(encoding);
CharsetDecoder decoder = cs.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
LineNumberReader lineReader = new LineNumberReader(streamReader);
dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));
List<String[]> lines = new ArrayList<>();
String line = null;
while ((line = lineReader.readLine()) != null) {
// note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation,
// even though the unknown dictionary returns hardcoded null here.
final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
lines.add(parsed);
}
Collections.sort(lines, new Comparator<String[]>() {
public int compare(String[] left, String[] right) {
int leftId = CharacterDefinition.lookupCharacterClass(left[0]);
int rightId = CharacterDefinition.lookupCharacterClass(right[0]);
return leftId - rightId;
}
});
for (String[] entry : lines) {
dictionary.put(entry);
}
return dictionary;
}
public void readCharacterDefinition(String filename, UnknownDictionaryWriter dictionary) throws IOException {
FileInputStream inputStream = new FileInputStream(filename);
InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
LineNumberReader lineReader = new LineNumberReader(streamReader);
String line = null;
while ((line = lineReader.readLine()) != null) {
line = line.replaceAll("^\\s", "");
line = line.replaceAll("\\s*#.*", "");
line = line.replaceAll("\\s+", " ");
// Skip empty line or comment line
if(line.length() == 0) {
continue;
}
if(line.startsWith("0x")) { // Category mapping
String[] values = line.split(" ", 2); // Split only first space
if(!values[0].contains("..")) {
int cp = Integer.decode(values[0]).intValue();
dictionary.putCharacterCategory(cp, values[1]);
} else {
String[] codePoints = values[0].split("\\.\\.");
int cpFrom = Integer.decode(codePoints[0]).intValue();
int cpTo = Integer.decode(codePoints[1]).intValue();
for(int i = cpFrom; i <= cpTo; i++){
dictionary.putCharacterCategory(i, values[1]);
}
}
} else { // Invoke definition
String[] values = line.split(" "); // Consecutive space is merged above
String characterClassName = values[0];
int invoke = Integer.parseInt(values[1]);
int group = Integer.parseInt(values[2]);
int length = Integer.parseInt(values[3]);
dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
}
}
}
}

View File

@@ -1,85 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.dict;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
import org.apache.lucene.analysis.ja.util.TokenInfoDictionaryBuilder;
import org.apache.lucene.analysis.ja.util.TokenInfoDictionaryWriter;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.LuceneTestCase;
import static java.io.File.separatorChar;
import static org.apache.lucene.analysis.ja.dict.BinaryDictionary.ResourceScheme;
/**
* Tests of TokenInfoDictionary build tools; run using ant test-tools
*/
public class TokenInfoDictionaryTest extends LuceneTestCase {
public void testPut() throws Exception {
TokenInfoDictionary dict = newDictionary("名詞,1,1,2,名詞,一般,*,*,*,*,*,*,*",
// "large" id
"一般,5000,5000,3,名詞,一般,*,*,*,*,*,*,*");
IntsRef wordIdRef = new IntsRefBuilder().get();
dict.lookupWordIds(0, wordIdRef);
int wordId = wordIdRef.ints[wordIdRef.offset];
assertEquals(5000, dict.getLeftId(wordId));
assertEquals(5000, dict.getRightId(wordId));
assertEquals(3, dict.getWordCost(wordId));
dict.lookupWordIds(1, wordIdRef);
wordId = wordIdRef.ints[wordIdRef.offset];
assertEquals(1, dict.getLeftId(wordId));
assertEquals(1, dict.getRightId(wordId));
assertEquals(2, dict.getWordCost(wordId));
}
private TokenInfoDictionary newDictionary(String... entries) throws Exception {
Path dir = createTempDir();
try (OutputStream out = Files.newOutputStream(dir.resolve("test.csv"));
PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, "utf-8"))) {
for (String entry : entries) {
printer.println(entry);
}
}
TokenInfoDictionaryBuilder builder = new TokenInfoDictionaryBuilder(DictionaryFormat.IPADIC, "utf-8", true);
TokenInfoDictionaryWriter writer = builder.build(dir.toString());
writer.write(dir.toString());
String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', separatorChar);
// We must also load the other files (in BinaryDictionary) from the correct path
return new TokenInfoDictionary(ResourceScheme.FILE, dir.resolve(dictionaryPath).toString());
}
public void testPutException() throws Exception {
// too few columns
expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1,1,1,名詞,一般,*,*,*,*,*"));
// left id != right id
expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1285,1,1,名詞,一般,*,*,*,*,*,*,*"));
// left id != right id
expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1285,1,1,名詞,一般,*,*,*,*,*,*,*"));
// id too large
expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,8192,8192,1,名詞,一般,*,*,*,*,*,*,*"));
}
}