mirror of https://github.com/apache/lucene.git
LUCENE-10400: revise binary dictionaries' constructor in nori (#693)
This commit is contained in:
parent
f0d17e94d9
commit
58fa95deea
|
@ -77,7 +77,7 @@ API Changes
|
|||
* LUCENE-10368: IntTaxonomyFacets has been deprecated and is no longer a supported extension point
|
||||
for user-created faceting implementations. (Greg Miller)
|
||||
|
||||
* LUCENE-10400: Add constructors that take external resource Paths to dictionary classes in Kuromoji:
|
||||
* LUCENE-10400: Add constructors that take external resource Paths to dictionary classes in Kuromoji and Nori:
|
||||
ConnectionCosts, TokenInfoDictionary, and UnknownDictionary. Old constructors that take resource scheme and
|
||||
resource path in those classes are deprecated; they are replaced with the new constructors and are planned to be
|
||||
removed in a future release. (Tomoko Uchida, Uwe Schindler, Mike Sokolov)
|
||||
|
|
|
@ -18,25 +18,23 @@ package org.apache.lucene.analysis.ko.dict;
|
|||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.EOFException;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.channels.Channels;
|
||||
import java.nio.channels.ReadableByteChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import org.apache.lucene.analysis.ko.POS;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.InputStreamDataInput;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.IOSupplier;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
|
||||
/** Base class for a binary-encoded in-memory dictionary. */
|
||||
public abstract class BinaryDictionary implements Dictionary {
|
||||
|
||||
/** Used to specify where (dictionary) resources get loaded from. */
|
||||
@Deprecated(forRemoval = true, since = "9.1")
|
||||
public enum ResourceScheme {
|
||||
CLASSPATH,
|
||||
FILE
|
||||
|
@ -51,75 +49,36 @@ public abstract class BinaryDictionary implements Dictionary {
|
|||
public static final String POSDICT_HEADER = "ko_dict_pos";
|
||||
public static final int VERSION = 1;
|
||||
|
||||
private final ResourceScheme resourceScheme;
|
||||
private final String resourcePath;
|
||||
private final ByteBuffer buffer;
|
||||
private final int[] targetMapOffsets, targetMap;
|
||||
private final POS.Tag[] posDict;
|
||||
|
||||
protected BinaryDictionary() throws IOException {
|
||||
this(ResourceScheme.CLASSPATH, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param resourceScheme - scheme for loading resources (FILE or CLASSPATH).
|
||||
* @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH
|
||||
* scheme only, use this class's name as the path.
|
||||
*/
|
||||
protected BinaryDictionary(ResourceScheme resourceScheme, String resourcePath)
|
||||
protected BinaryDictionary(
|
||||
IOSupplier<InputStream> targetMapResource,
|
||||
IOSupplier<InputStream> posResource,
|
||||
IOSupplier<InputStream> dictResource)
|
||||
throws IOException {
|
||||
this.resourceScheme = resourceScheme;
|
||||
if (resourcePath == null) {
|
||||
if (resourceScheme != ResourceScheme.CLASSPATH) {
|
||||
throw new IllegalArgumentException(
|
||||
"resourcePath must be supplied with FILE resource scheme");
|
||||
}
|
||||
this.resourcePath = getClass().getSimpleName();
|
||||
} else {
|
||||
if (resourceScheme == ResourceScheme.CLASSPATH && !resourcePath.startsWith("/")) {
|
||||
resourcePath = "/".concat(resourcePath);
|
||||
}
|
||||
this.resourcePath = resourcePath;
|
||||
}
|
||||
int[] targetMapOffsets, targetMap;
|
||||
ByteBuffer buffer;
|
||||
try (InputStream mapIS = new BufferedInputStream(getResource(TARGETMAP_FILENAME_SUFFIX));
|
||||
InputStream posIS = new BufferedInputStream(getResource(POSDICT_FILENAME_SUFFIX));
|
||||
// no buffering here, as we load in one large buffer
|
||||
InputStream dictIS = getResource(DICT_FILENAME_SUFFIX)) {
|
||||
try (InputStream mapIS = new BufferedInputStream(targetMapResource.get())) {
|
||||
DataInput in = new InputStreamDataInput(mapIS);
|
||||
CodecUtil.checkHeader(in, TARGETMAP_HEADER, VERSION, VERSION);
|
||||
targetMap = new int[in.readVInt()];
|
||||
targetMapOffsets = new int[in.readVInt()];
|
||||
int accum = 0, sourceId = 0;
|
||||
for (int ofs = 0; ofs < targetMap.length; ofs++) {
|
||||
final int val = in.readVInt();
|
||||
if ((val & 0x01) != 0) {
|
||||
targetMapOffsets[sourceId] = ofs;
|
||||
sourceId++;
|
||||
}
|
||||
accum += val >>> 1;
|
||||
targetMap[ofs] = accum;
|
||||
}
|
||||
if (sourceId + 1 != targetMapOffsets.length)
|
||||
throw new IOException(
|
||||
"targetMap file format broken; targetMap.length="
|
||||
+ targetMap.length
|
||||
+ ", targetMapOffsets.length="
|
||||
+ targetMapOffsets.length
|
||||
+ ", sourceId="
|
||||
+ sourceId);
|
||||
targetMapOffsets[sourceId] = targetMap.length;
|
||||
this.targetMap = new int[in.readVInt()];
|
||||
this.targetMapOffsets = new int[in.readVInt()];
|
||||
populateTargetMap(in, this.targetMap, this.targetMapOffsets);
|
||||
}
|
||||
|
||||
in = new InputStreamDataInput(posIS);
|
||||
try (InputStream posIS = new BufferedInputStream(posResource.get())) {
|
||||
DataInput in = new InputStreamDataInput(posIS);
|
||||
CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION);
|
||||
int posSize = in.readVInt();
|
||||
posDict = new POS.Tag[posSize];
|
||||
this.posDict = new POS.Tag[posSize];
|
||||
for (int j = 0; j < posSize; j++) {
|
||||
posDict[j] = POS.resolveTag(in.readByte());
|
||||
}
|
||||
}
|
||||
|
||||
in = new InputStreamDataInput(dictIS);
|
||||
// no buffering here, as we load in one large buffer
|
||||
try (InputStream dictIS = dictResource.get()) {
|
||||
DataInput in = new InputStreamDataInput(dictIS);
|
||||
CodecUtil.checkHeader(in, DICT_HEADER, VERSION, VERSION);
|
||||
final int size = in.readVInt();
|
||||
final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
|
||||
|
@ -128,48 +87,31 @@ public abstract class BinaryDictionary implements Dictionary {
|
|||
if (read != size) {
|
||||
throw new EOFException("Cannot read whole dictionary");
|
||||
}
|
||||
buffer = tmpBuffer.asReadOnlyBuffer();
|
||||
}
|
||||
|
||||
this.targetMap = targetMap;
|
||||
this.targetMapOffsets = targetMapOffsets;
|
||||
this.buffer = buffer;
|
||||
}
|
||||
|
||||
protected final InputStream getResource(String suffix) throws IOException {
|
||||
switch (resourceScheme) {
|
||||
case CLASSPATH:
|
||||
return getClassResource(resourcePath + suffix);
|
||||
case FILE:
|
||||
return Files.newInputStream(Paths.get(resourcePath + suffix));
|
||||
default:
|
||||
throw new IllegalStateException("unknown resource scheme " + resourceScheme);
|
||||
this.buffer = tmpBuffer.asReadOnlyBuffer();
|
||||
}
|
||||
}
|
||||
|
||||
public static InputStream getResource(ResourceScheme scheme, String path) throws IOException {
|
||||
switch (scheme) {
|
||||
case CLASSPATH:
|
||||
return getClassResource(path);
|
||||
case FILE:
|
||||
return Files.newInputStream(Paths.get(path));
|
||||
default:
|
||||
throw new IllegalStateException("unknown resource scheme " + scheme);
|
||||
private static void populateTargetMap(DataInput in, int[] targetMap, int[] targetMapOffsets)
|
||||
throws IOException {
|
||||
int accum = 0, sourceId = 0;
|
||||
for (int ofs = 0; ofs < targetMap.length; ofs++) {
|
||||
final int val = in.readVInt();
|
||||
if ((val & 0x01) != 0) {
|
||||
targetMapOffsets[sourceId] = ofs;
|
||||
sourceId++;
|
||||
}
|
||||
accum += val >>> 1;
|
||||
targetMap[ofs] = accum;
|
||||
}
|
||||
}
|
||||
|
||||
// util, reused by ConnectionCosts and CharacterDefinition
|
||||
public static InputStream getClassResource(Class<?> clazz, String suffix) throws IOException {
|
||||
final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix);
|
||||
if (is == null) {
|
||||
throw new FileNotFoundException(
|
||||
"Not in classpath: " + clazz.getName().replace('.', '/') + suffix);
|
||||
}
|
||||
return is;
|
||||
}
|
||||
|
||||
private static InputStream getClassResource(String path) throws IOException {
|
||||
return IOUtils.requireResourceNonNull(BinaryDictionary.class.getResourceAsStream(path), path);
|
||||
if (sourceId + 1 != targetMapOffsets.length)
|
||||
throw new IOException(
|
||||
"targetMap file format broken; targetMap.length="
|
||||
+ targetMap.length
|
||||
+ ", targetMapOffsets.length="
|
||||
+ targetMapOffsets.length
|
||||
+ ", sourceId="
|
||||
+ sourceId);
|
||||
targetMapOffsets[sourceId] = targetMap.length;
|
||||
}
|
||||
|
||||
public void lookupWordIds(int sourceId, IntsRef ref) {
|
||||
|
|
|
@ -73,11 +73,7 @@ public final class CharacterDefinition {
|
|||
public static final byte HANJANUMERIC = (byte) CharacterClass.HANJANUMERIC.ordinal();
|
||||
|
||||
private CharacterDefinition() throws IOException {
|
||||
InputStream is = null;
|
||||
boolean success = false;
|
||||
try {
|
||||
is = BinaryDictionary.getClassResource(getClass(), FILENAME_SUFFIX);
|
||||
is = new BufferedInputStream(is);
|
||||
try (InputStream is = new BufferedInputStream(getClassResource())) {
|
||||
final DataInput in = new InputStreamDataInput(is);
|
||||
CodecUtil.checkHeader(in, HEADER, VERSION, VERSION);
|
||||
in.readBytes(characterCategoryMap, 0, characterCategoryMap.length);
|
||||
|
@ -86,16 +82,15 @@ public final class CharacterDefinition {
|
|||
invokeMap[i] = (b & 0x01) != 0;
|
||||
groupMap[i] = (b & 0x02) != 0;
|
||||
}
|
||||
success = true;
|
||||
} finally {
|
||||
if (success) {
|
||||
IOUtils.close(is);
|
||||
} else {
|
||||
IOUtils.closeWhileHandlingException(is);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static InputStream getClassResource() throws IOException {
|
||||
final String resourcePath = CharacterDefinition.class.getSimpleName() + FILENAME_SUFFIX;
|
||||
return IOUtils.requireResourceNonNull(
|
||||
CharacterDefinition.class.getResourceAsStream(resourcePath), resourcePath);
|
||||
}
|
||||
|
||||
public byte getCharacterClass(char c) {
|
||||
return characterCategoryMap[c];
|
||||
}
|
||||
|
|
|
@ -20,9 +20,14 @@ import java.io.BufferedInputStream;
|
|||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.InputStreamDataInput;
|
||||
import org.apache.lucene.util.IOSupplier;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
/** n-gram connection cost data */
|
||||
public final class ConnectionCosts {
|
||||
|
@ -38,12 +43,32 @@ public final class ConnectionCosts {
|
|||
* @param scheme - scheme for loading resources (FILE or CLASSPATH).
|
||||
* @param resourcePath - where to load resources from, without the ".dat" suffix
|
||||
*/
|
||||
@Deprecated(forRemoval = true, since = "9.1")
|
||||
@SuppressWarnings("removal")
|
||||
public ConnectionCosts(BinaryDictionary.ResourceScheme scheme, String resourcePath)
|
||||
throws IOException {
|
||||
try (InputStream is =
|
||||
new BufferedInputStream(
|
||||
BinaryDictionary.getResource(
|
||||
scheme, "/" + resourcePath.replace('.', '/') + FILENAME_SUFFIX))) {
|
||||
this(
|
||||
scheme == BinaryDictionary.ResourceScheme.FILE
|
||||
? () -> Files.newInputStream(Paths.get(resourcePath + FILENAME_SUFFIX))
|
||||
: ConnectionCosts::getClassResource);
|
||||
}
|
||||
|
||||
/**
 * Builds a {@link ConnectionCosts} instance from a data file located outside the classpath.
 *
 * <p>The file is not opened here; opening is deferred to the delegate constructor, which calls
 * the supplier when it is ready to read.
 *
 * @param connectionCostsFile path of the connection costs resource to load
 * @throws IOException if the resource was not found or is broken
 */
public ConnectionCosts(Path connectionCostsFile) throws IOException {
  this(
      () -> {
        return Files.newInputStream(connectionCostsFile);
      });
}
|
||||
|
||||
/**
 * Loads the connection costs from the class-relative resource opened by {@code
 * getClassResource()}.
 *
 * @throws IOException if the delegate constructor fails to open or read the resource
 */
private ConnectionCosts() throws IOException {
  this(() -> getClassResource());
}
|
||||
|
||||
private ConnectionCosts(IOSupplier<InputStream> connectionCostResource) throws IOException {
|
||||
try (InputStream is = new BufferedInputStream(connectionCostResource.get())) {
|
||||
final DataInput in = new InputStreamDataInput(is);
|
||||
CodecUtil.checkHeader(in, HEADER, VERSION, VERSION);
|
||||
this.forwardSize = in.readVInt();
|
||||
|
@ -63,8 +88,10 @@ public final class ConnectionCosts {
|
|||
}
|
||||
}
|
||||
|
||||
private ConnectionCosts() throws IOException {
|
||||
this(BinaryDictionary.ResourceScheme.CLASSPATH, ConnectionCosts.class.getName());
|
||||
private static InputStream getClassResource() throws IOException {
|
||||
final String resourcePath = ConnectionCosts.class.getSimpleName() + FILENAME_SUFFIX;
|
||||
return IOUtils.requireResourceNonNull(
|
||||
ConnectionCosts.class.getResourceAsStream(resourcePath), resourcePath);
|
||||
}
|
||||
|
||||
public int get(int forwardId, int backwardId) {
|
||||
|
|
|
@ -19,8 +19,13 @@ package org.apache.lucene.analysis.ko.dict;
|
|||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.InputStreamDataInput;
|
||||
import org.apache.lucene.util.IOSupplier;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||
|
||||
|
@ -35,7 +40,11 @@ public final class TokenInfoDictionary extends BinaryDictionary {
|
|||
private final TokenInfoFST fst;
|
||||
|
||||
private TokenInfoDictionary() throws IOException {
|
||||
this(ResourceScheme.CLASSPATH, null);
|
||||
this(
|
||||
() -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
|
||||
() -> getClassResource(POSDICT_FILENAME_SUFFIX),
|
||||
() -> getClassResource(DICT_FILENAME_SUFFIX),
|
||||
() -> getClassResource(FST_FILENAME_SUFFIX));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -43,17 +52,64 @@ public final class TokenInfoDictionary extends BinaryDictionary {
|
|||
* @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH
|
||||
* scheme only, use this class's name as the path.
|
||||
*/
|
||||
@Deprecated(forRemoval = true, since = "9.1")
|
||||
@SuppressWarnings("removal")
|
||||
public TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath)
|
||||
throws IOException {
|
||||
super(resourceScheme, resourcePath);
|
||||
this(
|
||||
resourceScheme == ResourceScheme.FILE
|
||||
? () -> Files.newInputStream(Paths.get(resourcePath + TARGETMAP_FILENAME_SUFFIX))
|
||||
: () -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
|
||||
resourceScheme == ResourceScheme.FILE
|
||||
? () -> Files.newInputStream(Paths.get(resourcePath + POSDICT_FILENAME_SUFFIX))
|
||||
: () -> getClassResource(POSDICT_FILENAME_SUFFIX),
|
||||
resourceScheme == ResourceScheme.FILE
|
||||
? () -> Files.newInputStream(Paths.get(resourcePath + DICT_FILENAME_SUFFIX))
|
||||
: () -> getClassResource(DICT_FILENAME_SUFFIX),
|
||||
resourceScheme == ResourceScheme.FILE
|
||||
? () -> Files.newInputStream(Paths.get(resourcePath + FST_FILENAME_SUFFIX))
|
||||
: () -> getClassResource(FST_FILENAME_SUFFIX));
|
||||
}
|
||||
|
||||
/**
 * Builds a {@link TokenInfoDictionary} from four data files located outside the classpath.
 *
 * <p>No file is opened here; each path is wrapped in a supplier that the delegate constructor
 * invokes when it reads the corresponding resource.
 *
 * @param targetMapFile path of the target map resource
 * @param posDictFile path of the POS dictionary resource
 * @param dictFile path of the dictionary entries resource
 * @param fstFile path of the encoded FST data resource
 * @throws IOException if a resource was not found or is broken
 */
public TokenInfoDictionary(Path targetMapFile, Path posDictFile, Path dictFile, Path fstFile)
    throws IOException {
  this(
      openLazily(targetMapFile),
      openLazily(posDictFile),
      openLazily(dictFile),
      openLazily(fstFile));
}

/** Wraps {@code file} in a supplier that opens a new input stream each time it is invoked. */
private static IOSupplier<InputStream> openLazily(Path file) {
  return () -> Files.newInputStream(file);
}
|
||||
|
||||
private TokenInfoDictionary(
|
||||
IOSupplier<InputStream> targetMapResource,
|
||||
IOSupplier<InputStream> posResource,
|
||||
IOSupplier<InputStream> dictResource,
|
||||
IOSupplier<InputStream> fstResource)
|
||||
throws IOException {
|
||||
super(targetMapResource, posResource, dictResource);
|
||||
FST<Long> fst;
|
||||
try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {
|
||||
try (InputStream is = new BufferedInputStream(fstResource.get())) {
|
||||
DataInput in = new InputStreamDataInput(is);
|
||||
fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
|
||||
}
|
||||
this.fst = new TokenInfoFST(fst);
|
||||
}
|
||||
|
||||
private static InputStream getClassResource(String suffix) throws IOException {
|
||||
final String resourcePath = TokenInfoDictionary.class.getSimpleName() + suffix;
|
||||
return IOUtils.requireResourceNonNull(
|
||||
TokenInfoDictionary.class.getResourceAsStream(resourcePath), resourcePath);
|
||||
}
|
||||
|
||||
public TokenInfoFST getFST() {
|
||||
return fst;
|
||||
}
|
||||
|
|
|
@ -17,6 +17,11 @@
|
|||
package org.apache.lucene.analysis.ko.dict;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
/** Dictionary for unknown-word handling. */
|
||||
public final class UnknownDictionary extends BinaryDictionary {
|
||||
|
@ -27,12 +32,47 @@ public final class UnknownDictionary extends BinaryDictionary {
|
|||
* @param resourcePath where to load resources from; a path, including the file base name without
|
||||
* extension; this is used to match multiple files with the same base name.
|
||||
*/
|
||||
@Deprecated(forRemoval = true, since = "9.1")
|
||||
@SuppressWarnings("removal")
|
||||
public UnknownDictionary(ResourceScheme scheme, String resourcePath) throws IOException {
|
||||
super(scheme, resourcePath);
|
||||
super(
|
||||
scheme == ResourceScheme.FILE
|
||||
? () -> Files.newInputStream(Paths.get(resourcePath + TARGETMAP_FILENAME_SUFFIX))
|
||||
: () -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
|
||||
scheme == ResourceScheme.FILE
|
||||
? () -> Files.newInputStream(Paths.get(resourcePath + POSDICT_FILENAME_SUFFIX))
|
||||
: () -> getClassResource(POSDICT_FILENAME_SUFFIX),
|
||||
scheme == ResourceScheme.FILE
|
||||
? () -> Files.newInputStream(Paths.get(resourcePath + DICT_FILENAME_SUFFIX))
|
||||
: () -> getClassResource(DICT_FILENAME_SUFFIX));
|
||||
}
|
||||
|
||||
/**
 * Builds an {@link UnknownDictionary} from three data files located outside the classpath.
 *
 * <p>No file is opened here; each path is wrapped in a supplier that the superclass constructor
 * invokes when it reads the corresponding resource.
 *
 * @param targetMapFile path of the target map resource
 * @param posDictFile path of the POS dictionary resource
 * @param dictFile path of the dictionary entries resource
 * @throws IOException if a resource was not found or is broken
 */
public UnknownDictionary(Path targetMapFile, Path posDictFile, Path dictFile) throws IOException {
  super(
      () -> {
        return Files.newInputStream(targetMapFile);
      },
      () -> {
        return Files.newInputStream(posDictFile);
      },
      () -> {
        return Files.newInputStream(dictFile);
      });
}
|
||||
|
||||
private UnknownDictionary() throws IOException {
|
||||
super();
|
||||
super(
|
||||
() -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
|
||||
() -> getClassResource(POSDICT_FILENAME_SUFFIX),
|
||||
() -> getClassResource(DICT_FILENAME_SUFFIX));
|
||||
}
|
||||
|
||||
private static InputStream getClassResource(String suffix) throws IOException {
|
||||
final String resourcePath = UnknownDictionary.class.getSimpleName() + suffix;
|
||||
return IOUtils.requireResourceNonNull(
|
||||
UnknownDictionary.class.getResourceAsStream(resourcePath), resourcePath);
|
||||
}
|
||||
|
||||
public CharacterDefinition getCharacterDefinition() {
|
||||
|
|
|
@ -473,6 +473,7 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
// Make sure loading custom dictionaries from classpath works:
|
||||
@SuppressWarnings("removal")
|
||||
public void testCustomDictionary() throws Exception {
|
||||
Tokenizer tokenizer =
|
||||
new KoreanTokenizer(
|
||||
|
|
|
@ -0,0 +1,101 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ko.dict;
|
||||
|
||||
import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.DICT_FILENAME_SUFFIX;
|
||||
import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.POSDICT_FILENAME_SUFFIX;
|
||||
import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.TARGETMAP_FILENAME_SUFFIX;
|
||||
import static org.apache.lucene.analysis.ko.dict.TokenInfoDictionary.FST_FILENAME_SUFFIX;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import org.apache.lucene.analysis.ko.util.DictionaryBuilder;
|
||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||
import org.junit.Before;
|
||||
|
||||
public class TestExternalDictionary extends LuceneTestCase {
|
||||
|
||||
private Path dir;
|
||||
|
||||
@Override
|
||||
@Before
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
dir = createTempDir("systemDict");
|
||||
try (BufferedWriter writer =
|
||||
Files.newBufferedWriter(dir.resolve("unk.def"), StandardCharsets.UTF_8)) {
|
||||
writer.write("DEFAULT,1798,3559,3677,SY,*,*,*,*,*,*,*");
|
||||
writer.newLine();
|
||||
writer.write("SPACE,1795,3556,1065,SP,*,*,*,*,*,*,*");
|
||||
writer.newLine();
|
||||
}
|
||||
try (BufferedWriter writer =
|
||||
Files.newBufferedWriter(dir.resolve("char.def"), StandardCharsets.UTF_8)) {
|
||||
writer.write("0x0021..0x002F SYMBOL");
|
||||
writer.newLine();
|
||||
writer.write("0x0030..0x0039 NUMERIC");
|
||||
writer.newLine();
|
||||
}
|
||||
try (BufferedWriter writer =
|
||||
Files.newBufferedWriter(dir.resolve("matrix.def"), StandardCharsets.UTF_8)) {
|
||||
writer.write("3 3");
|
||||
writer.newLine();
|
||||
writer.write("1 1 0");
|
||||
writer.newLine();
|
||||
writer.write("1 2 0");
|
||||
writer.newLine();
|
||||
}
|
||||
try (BufferedWriter writer =
|
||||
Files.newBufferedWriter(dir.resolve("noun.csv"), StandardCharsets.UTF_8)) {
|
||||
writer.write("명사,1,1,2,NNG,*,*,*,*,*,*,*");
|
||||
writer.newLine();
|
||||
writer.write("일반,5000,5000,3,NNG,*,*,*,*,*,*,*");
|
||||
writer.newLine();
|
||||
}
|
||||
DictionaryBuilder.build(dir, dir, "utf-8", true);
|
||||
}
|
||||
|
||||
public void testLoadExternalTokenInfoDictionary() throws Exception {
|
||||
String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', '/');
|
||||
TokenInfoDictionary dict =
|
||||
new TokenInfoDictionary(
|
||||
dir.resolve(dictionaryPath + TARGETMAP_FILENAME_SUFFIX),
|
||||
dir.resolve(dictionaryPath + POSDICT_FILENAME_SUFFIX),
|
||||
dir.resolve(dictionaryPath + DICT_FILENAME_SUFFIX),
|
||||
dir.resolve(dictionaryPath + FST_FILENAME_SUFFIX));
|
||||
assertNotNull(dict.getFST());
|
||||
}
|
||||
|
||||
public void testLoadExternalUnknownDictionary() throws Exception {
|
||||
String dictionaryPath = UnknownDictionary.class.getName().replace('.', '/');
|
||||
UnknownDictionary dict =
|
||||
new UnknownDictionary(
|
||||
dir.resolve(dictionaryPath + TARGETMAP_FILENAME_SUFFIX),
|
||||
dir.resolve(dictionaryPath + POSDICT_FILENAME_SUFFIX),
|
||||
dir.resolve(dictionaryPath + DICT_FILENAME_SUFFIX));
|
||||
assertNotNull(dict.getCharacterDefinition());
|
||||
}
|
||||
|
||||
public void testLoadExternalConnectionCosts() throws Exception {
|
||||
String dictionaryPath = ConnectionCosts.class.getName().replace('.', '/');
|
||||
ConnectionCosts cc =
|
||||
new ConnectionCosts(dir.resolve(dictionaryPath + ConnectionCosts.FILENAME_SUFFIX));
|
||||
assertEquals(0, cc.get(1, 1));
|
||||
}
|
||||
}
|
|
@ -16,7 +16,10 @@
|
|||
*/
|
||||
package org.apache.lucene.analysis.ko.dict;
|
||||
|
||||
import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.ResourceScheme;
|
||||
import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.DICT_FILENAME_SUFFIX;
|
||||
import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.POSDICT_FILENAME_SUFFIX;
|
||||
import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.TARGETMAP_FILENAME_SUFFIX;
|
||||
import static org.apache.lucene.analysis.ko.dict.TokenInfoDictionary.FST_FILENAME_SUFFIX;
|
||||
|
||||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
|
@ -76,7 +79,11 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
|
|||
DictionaryBuilder.build(dir, dir, "utf-8", true);
|
||||
String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', '/');
|
||||
// We must also load the other files (in BinaryDictionary) from the correct path
|
||||
return new TokenInfoDictionary(ResourceScheme.FILE, dir.resolve(dictionaryPath).toString());
|
||||
return new TokenInfoDictionary(
|
||||
dir.resolve(dictionaryPath + TARGETMAP_FILENAME_SUFFIX),
|
||||
dir.resolve(dictionaryPath + POSDICT_FILENAME_SUFFIX),
|
||||
dir.resolve(dictionaryPath + DICT_FILENAME_SUFFIX),
|
||||
dir.resolve(dictionaryPath + FST_FILENAME_SUFFIX));
|
||||
}
|
||||
|
||||
public void testPutException() {
|
||||
|
|
Loading…
Reference in New Issue