mirror of https://github.com/apache/lucene.git
LUCENE-8904: enhance Nori DictionaryBuilder tool
This commit is contained in:
parent
6d79cc9e44
commit
2ac5fb668a
lucene/analysis/nori
build.xml
src
java/org/apache/lucene/analysis/ko/dict
tools
java/org/apache/lucene/analysis/ko/util
test/org/apache/lucene/analysis/ko/dict
|
@ -123,8 +123,8 @@
|
|||
</compile>
|
||||
</target>
|
||||
|
||||
<target name="test-tools" depends="compile-tools-tests">
|
||||
<test-macro dataDir="src/tools/test" junit.classpath="tools.test.classpath"/>
|
||||
<target name="test-tools" depends="install-junit4-taskdef, compile-tools-tests">
|
||||
<test-macro testsDir="${build.dir}/classes/tools-test" workDir="src/tools/test" junit.classpath="tools.test.classpath"/>
|
||||
</target>
|
||||
|
||||
<target name="compile-test" depends="module-build.compile-test, compile-tools-tests"/>
|
||||
|
|
|
@ -24,6 +24,8 @@ import java.io.InputStream;
|
|||
import java.nio.ByteBuffer;
|
||||
import java.nio.channels.Channels;
|
||||
import java.nio.channels.ReadableByteChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
import org.apache.lucene.analysis.ko.POS;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
|
@ -36,6 +38,14 @@ import org.apache.lucene.util.IntsRef;
|
|||
* Base class for a binary-encoded in-memory dictionary.
|
||||
*/
|
||||
public abstract class BinaryDictionary implements Dictionary {
|
||||
|
||||
/**
|
||||
* Used to specify where (dictionary) resources get loaded from.
* CLASSPATH hands the resource path to the class loader; FILE opens the resource path
* as a file-system path. A null resource path is only accepted with CLASSPATH.
|
||||
*/
|
||||
public enum ResourceScheme {
|
||||
CLASSPATH, FILE
|
||||
}
|
||||
|
||||
public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
|
||||
public static final String DICT_FILENAME_SUFFIX = "$buffer.dat";
|
||||
public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat";
|
||||
|
@ -45,11 +55,31 @@ public abstract class BinaryDictionary implements Dictionary {
|
|||
public static final String POSDICT_HEADER = "ko_dict_pos";
|
||||
public static final int VERSION = 1;
|
||||
|
||||
private final ResourceScheme resourceScheme;
|
||||
private final String resourcePath;
|
||||
private final ByteBuffer buffer;
|
||||
private final int[] targetMapOffsets, targetMap;
|
||||
private final POS.Tag[] posDict;
|
||||
|
||||
/**
* Creates a dictionary whose resources are loaded with the CLASSPATH scheme.
* Delegates to the two-argument constructor with a null resource path, which makes that
* constructor derive the path from the runtime class name ('.' replaced by '/').
*
* @throws IOException declared by the delegated-to constructor
*/
protected BinaryDictionary() throws IOException {
|
||||
this(ResourceScheme.CLASSPATH, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param resourceScheme - scheme for loading resources (FILE or CLASSPATH).
|
||||
* @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH scheme only, use
|
||||
* this class's name as the path.
|
||||
*/
|
||||
protected BinaryDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
|
||||
this.resourceScheme = resourceScheme;
|
||||
if (resourcePath == null) {
|
||||
if (resourceScheme != ResourceScheme.CLASSPATH) {
|
||||
throw new IllegalArgumentException("resourcePath must be supplied with FILE resource scheme");
|
||||
}
|
||||
this.resourcePath = getClass().getName().replace('.', '/');
|
||||
} else {
|
||||
this.resourcePath = resourcePath;
|
||||
}
|
||||
InputStream mapIS = null, dictIS = null, posIS = null;
|
||||
int[] targetMapOffsets = null, targetMap = null;
|
||||
ByteBuffer buffer = null;
|
||||
|
@ -72,7 +102,9 @@ public abstract class BinaryDictionary implements Dictionary {
|
|||
targetMap[ofs] = accum;
|
||||
}
|
||||
if (sourceId + 1 != targetMapOffsets.length)
|
||||
throw new IOException("targetMap file format broken");
|
||||
throw new IOException("targetMap file format broken; targetMap.length=" + targetMap.length
|
||||
+ ", targetMapOffsets.length=" + targetMapOffsets.length
|
||||
+ ", sourceId=" + sourceId);
|
||||
targetMapOffsets[sourceId] = targetMap.length;
|
||||
mapIS.close(); mapIS = null;
|
||||
|
||||
|
@ -103,9 +135,9 @@ public abstract class BinaryDictionary implements Dictionary {
|
|||
success = true;
|
||||
} finally {
|
||||
if (success) {
|
||||
IOUtils.close(mapIS, dictIS);
|
||||
IOUtils.close(mapIS, posIS, dictIS);
|
||||
} else {
|
||||
IOUtils.closeWhileHandlingException(mapIS, dictIS);
|
||||
IOUtils.closeWhileHandlingException(mapIS, posIS, dictIS);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -115,14 +147,30 @@ public abstract class BinaryDictionary implements Dictionary {
|
|||
}
|
||||
|
||||
protected final InputStream getResource(String suffix) throws IOException {
|
||||
return getClassResource(getClass(), suffix);
|
||||
switch(resourceScheme) {
|
||||
case CLASSPATH:
|
||||
return getClassResource(resourcePath + suffix);
|
||||
case FILE:
|
||||
return Files.newInputStream(Paths.get(resourcePath + suffix));
|
||||
default:
|
||||
throw new IllegalStateException("unknown resource scheme " + resourceScheme);
|
||||
}
|
||||
}
|
||||
|
||||
// util, reused by ConnectionCosts and CharacterDefinition
|
||||
public static final InputStream getClassResource(Class<?> clazz, String suffix) throws IOException {
|
||||
final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix);
|
||||
if (is == null)
|
||||
throw new FileNotFoundException("Not in classpath: " + clazz.getName().replace('.','/') + suffix);
|
||||
if (is == null) {
|
||||
throw new FileNotFoundException("Not in classpath: " + clazz.getName().replace('.', '/') + suffix);
|
||||
}
|
||||
return is;
|
||||
}
|
||||
|
||||
private InputStream getClassResource(String path) throws IOException {
|
||||
final InputStream is = BinaryDictionary.class.getClassLoader().getResourceAsStream(path);
|
||||
if (is == null) {
|
||||
throw new FileNotFoundException("Not in classpath: " + path);
|
||||
}
|
||||
return is;
|
||||
}
|
||||
|
||||
|
|
|
@ -34,11 +34,20 @@ public final class TokenInfoDictionary extends BinaryDictionary {
|
|||
public static final String FST_FILENAME_SUFFIX = "$fst.dat";
|
||||
|
||||
private final TokenInfoFST fst;
|
||||
|
||||
|
||||
private TokenInfoDictionary() throws IOException {
|
||||
super();
|
||||
this(ResourceScheme.CLASSPATH, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param resourceScheme - scheme for loading resources (FILE or CLASSPATH).
|
||||
* @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH scheme only, use
|
||||
* this class's name as the path.
|
||||
*/
|
||||
TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
|
||||
super(resourceScheme, resourcePath);
|
||||
InputStream is = null;
|
||||
FST<Long> fst = null;
|
||||
FST<Long> fst;
|
||||
boolean success = false;
|
||||
try {
|
||||
is = getResource(FST_FILENAME_SUFFIX);
|
||||
|
|
|
@ -38,6 +38,8 @@ import org.apache.lucene.util.ArrayUtil;
|
|||
import org.apache.lucene.analysis.ko.dict.BinaryDictionary;
|
||||
|
||||
public abstract class BinaryDictionaryWriter {
|
||||
private final static int ID_LIMIT = 8192;
|
||||
|
||||
protected final Class<? extends BinaryDictionary> implClazz;
|
||||
protected ByteBuffer buffer;
|
||||
private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1;
|
||||
|
@ -116,7 +118,7 @@ public abstract class BinaryDictionaryWriter {
|
|||
if (posType != POS.Type.MORPHEME && expression.length() > 0) {
|
||||
String[] exprTokens = expression.split("\\+");
|
||||
for (int i = 0; i < exprTokens.length; i++) {
|
||||
String[] tokenSplit = exprTokens[i].split("\\/");
|
||||
String[] tokenSplit = exprTokens[i].split("/");
|
||||
assert tokenSplit.length == 3;
|
||||
String surfaceForm = tokenSplit[0].trim();
|
||||
if (surfaceForm.isEmpty() == false) {
|
||||
|
@ -137,8 +139,12 @@ public abstract class BinaryDictionaryWriter {
|
|||
flags |= BinaryDictionary.HAS_READING;
|
||||
}
|
||||
|
||||
assert leftId < 8192; // there are still unused bits
|
||||
assert posType.ordinal() < 4;
|
||||
if (leftId >= ID_LIMIT) {
|
||||
throw new IllegalArgumentException("leftId >= " + ID_LIMIT + ": " + leftId);
|
||||
}
|
||||
if (posType.ordinal() >= 4) {
|
||||
throw new IllegalArgumentException("posType.ordinal() >= " + 4 + ": " + posType.name());
|
||||
}
|
||||
buffer.putShort((short)(leftId << 2 | posType.ordinal()));
|
||||
buffer.putShort((short) (rightId << 2 | flags));
|
||||
buffer.putShort(wordCost);
|
||||
|
@ -178,16 +184,17 @@ public abstract class BinaryDictionaryWriter {
|
|||
}
|
||||
|
||||
public void addMapping(int sourceId, int wordId) {
|
||||
assert wordId > lastWordId : "words out of order: " + wordId + " vs lastID: " + lastWordId;
|
||||
if (wordId <= lastWordId) {
|
||||
throw new IllegalStateException("words out of order: " + wordId + " vs lastID: " + lastWordId);
|
||||
}
|
||||
|
||||
if (sourceId > lastSourceId) {
|
||||
assert sourceId > lastSourceId : "source ids out of order: lastSourceId=" + lastSourceId + " vs sourceId=" + sourceId;
|
||||
targetMapOffsets = ArrayUtil.grow(targetMapOffsets, sourceId + 1);
|
||||
for (int i = lastSourceId + 1; i <= sourceId; i++) {
|
||||
targetMapOffsets[i] = targetMapEndOffset;
|
||||
}
|
||||
} else {
|
||||
assert sourceId == lastSourceId;
|
||||
} else if (sourceId != lastSourceId) {
|
||||
throw new IllegalStateException("source ids not in increasing order: lastSourceId=" + lastSourceId + " vs sourceId=" + sourceId);
|
||||
}
|
||||
|
||||
targetMap = ArrayUtil.grow(targetMap, targetMapEndOffset + 1);
|
||||
|
@ -236,7 +243,9 @@ public abstract class BinaryDictionaryWriter {
|
|||
}
|
||||
prev += delta;
|
||||
}
|
||||
assert sourceId == numSourceIds : "sourceId:"+sourceId+" != numSourceIds:"+numSourceIds;
|
||||
if (sourceId != numSourceIds) {
|
||||
throw new IllegalStateException("sourceId:" + sourceId + " != numSourceIds:" + numSourceIds);
|
||||
}
|
||||
} finally {
|
||||
os.close();
|
||||
}
|
||||
|
@ -254,8 +263,10 @@ public abstract class BinaryDictionaryWriter {
|
|||
if (s == null) {
|
||||
out.writeByte((byte) POS.Tag.UNKNOWN.ordinal());
|
||||
} else {
|
||||
String data[] = CSVUtil.parse(s);
|
||||
assert data.length == 2 : "malformed pos/semanticClass: " + s;
|
||||
String[] data = CSVUtil.parse(s);
|
||||
if (data.length != 2) {
|
||||
throw new IllegalArgumentException("Malformed pos/inflection: " + s + "; expected 2 characters");
|
||||
}
|
||||
out.writeByte((byte) POS.Tag.valueOf(data[0]).ordinal());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -84,8 +84,7 @@ public class TokenInfoDictionaryBuilder {
|
|||
String[] entry = CSVUtil.parse(line);
|
||||
|
||||
if(entry.length < 12) {
|
||||
System.out.println("Entry in CSV is not valid: " + line);
|
||||
continue;
|
||||
throw new IllegalArgumentException("Entry in CSV is not valid (12 field values expected): " + line);
|
||||
}
|
||||
|
||||
// NFKC normalize dictionary entry
|
||||
|
|
|
@ -0,0 +1,80 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ko.dict;
|
||||
|
||||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.PrintWriter;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.apache.lucene.analysis.ko.util.TokenInfoDictionaryBuilder;
|
||||
import org.apache.lucene.analysis.ko.util.TokenInfoDictionaryWriter;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.IntsRefBuilder;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import static java.io.File.separatorChar;
|
||||
import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.ResourceScheme;
|
||||
|
||||
/**
|
||||
* Tests of TokenInfoDictionary build tools; run using ant test-tools
|
||||
*/
|
||||
public class TokenInfoDictionaryTest extends LuceneTestCase {
|
||||
|
||||
public void testPut() throws Exception {
|
||||
TokenInfoDictionary dict = newDictionary("명사,1,1,2,NNG,*,*,*,*,*,*,*",
|
||||
// "large" id
|
||||
"일반,5000,5000,3,NNG,*,*,*,*,*,*,*");
|
||||
IntsRef wordIdRef = new IntsRefBuilder().get();
|
||||
|
||||
dict.lookupWordIds(0, wordIdRef);
|
||||
int wordId = wordIdRef.ints[wordIdRef.offset];
|
||||
assertEquals(1, dict.getLeftId(wordId));
|
||||
assertEquals(1, dict.getRightId(wordId));
|
||||
assertEquals(2, dict.getWordCost(wordId));
|
||||
|
||||
dict.lookupWordIds(1, wordIdRef);
|
||||
wordId = wordIdRef.ints[wordIdRef.offset];
|
||||
assertEquals(5000, dict.getLeftId(wordId));
|
||||
assertEquals(5000, dict.getRightId(wordId));
|
||||
assertEquals(3, dict.getWordCost(wordId));
|
||||
}
|
||||
|
||||
private TokenInfoDictionary newDictionary(String... entries) throws Exception {
|
||||
Path dir = createTempDir();
|
||||
try (OutputStream out = Files.newOutputStream(dir.resolve("test.csv"));
|
||||
PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, "utf-8"))) {
|
||||
for (String entry : entries) {
|
||||
printer.println(entry);
|
||||
}
|
||||
}
|
||||
TokenInfoDictionaryBuilder builder = new TokenInfoDictionaryBuilder("utf-8", true);
|
||||
TokenInfoDictionaryWriter writer = builder.build(dir.toString());
|
||||
writer.write(dir.toString());
|
||||
String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', separatorChar);
|
||||
// We must also load the other files (in BinaryDictionary) from the correct path
|
||||
return new TokenInfoDictionary(ResourceScheme.FILE, dir.resolve(dictionaryPath).toString());
|
||||
}
|
||||
|
||||
public void testPutException() throws Exception {
|
||||
// too few columns
|
||||
expectThrows(IllegalArgumentException.class, () -> newDictionary("HANGUL,1,1,1,NNG,*,*,*,*,*"));
|
||||
// id too large
|
||||
expectThrows(IllegalArgumentException.class, () -> newDictionary("HANGUL,8192,8192,1,NNG,*,*,*,*,*,*,*"));
|
||||
}
|
||||
}
|
|
@ -27,20 +27,10 @@ public class UnknownDictionaryTest extends LuceneTestCase {
|
|||
@Test
|
||||
public void testPutCharacterCategory() {
|
||||
UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);
|
||||
|
||||
try{
|
||||
unkDic.putCharacterCategory(0, "DUMMY_NAME");
|
||||
fail();
|
||||
} catch(Exception e) {
|
||||
|
||||
}
|
||||
|
||||
try{
|
||||
unkDic.putCharacterCategory(-1, "HANGUL");
|
||||
fail();
|
||||
} catch(Exception e) {
|
||||
|
||||
}
|
||||
|
||||
expectThrows(Exception.class, () -> unkDic.putCharacterCategory(0, "DUMMY_NAME"));
|
||||
|
||||
expectThrows(Exception.class, () -> unkDic.putCharacterCategory(-1, "HANGUL"));
|
||||
|
||||
unkDic.putCharacterCategory(0, "DEFAULT");
|
||||
unkDic.putCharacterCategory(1, "GREEK");
|
||||
|
@ -52,12 +42,8 @@ public class UnknownDictionaryTest extends LuceneTestCase {
|
|||
@Test
|
||||
public void testPut() {
|
||||
UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);
|
||||
try{
|
||||
unkDic.put(CSVUtil.parse("HANGUL,1800,3562,UNKNOWN,*,*,*,*,*,*,*"));
|
||||
fail();
|
||||
} catch(Exception e){
|
||||
|
||||
}
|
||||
expectThrows(NumberFormatException.class, () ->
|
||||
unkDic.put(CSVUtil.parse("HANGUL,1800,3562,UNKNOWN,*,*,*,*,*,*,*")));
|
||||
|
||||
String entry1 = "ALPHA,1793,3533,795,SL,*,*,*,*,*,*,*";
|
||||
String entry2 = "HANGUL,1800,3562,10247,UNKNOWN,*,*,*,*,*,*,*";
|
||||
|
|
Loading…
Reference in New Issue