diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 939021919be..14c590e8781 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -77,7 +77,7 @@ API Changes
 * LUCENE-10368: IntTaxonomyFacets has been deprecated and is no longer a supported extension
   point for user-created faceting implementations. (Greg Miller)
 
-* LUCENE-10400: Add constructors that take external resource Paths to dictionary classes in Kuromoji:
+* LUCENE-10400: Add constructors that take external resource Paths to dictionary classes in Kuromoji and Nori:
   ConnectionCosts, TokenInfoDictionary, and UnknownDictionary. Old constructors that take resource scheme and
   resource path in those classes are deprecated; These are replaced with the new constructors and planned to be
   removed in a future release. (Tomoko Uchida, Uwe Schindler, Mike Sokolov)
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/BinaryDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/BinaryDictionary.java
index 5f44ae63886..aba6782182b 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/BinaryDictionary.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/BinaryDictionary.java
@@ -18,25 +18,23 @@ package org.apache.lucene.analysis.ko.dict;
 
 import java.io.BufferedInputStream;
 import java.io.EOFException;
-import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.ByteBuffer;
 import java.nio.channels.Channels;
 import java.nio.channels.ReadableByteChannel;
-import java.nio.file.Files;
-import java.nio.file.Paths;
 import org.apache.lucene.analysis.ko.POS;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.InputStreamDataInput;
-import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.IOSupplier;
 import org.apache.lucene.util.IntsRef;
 
 /** Base class for a binary-encoded in-memory dictionary. */
 public abstract class BinaryDictionary implements Dictionary {
 
   /** Used to specify where (dictionary) resources get loaded from. */
+  @Deprecated(forRemoval = true, since = "9.1")
   public enum ResourceScheme {
     CLASSPATH,
     FILE
@@ -51,75 +49,36 @@ public abstract class BinaryDictionary implements Dictionary {
   public static final String POSDICT_HEADER = "ko_dict_pos";
   public static final int VERSION = 1;
 
-  private final ResourceScheme resourceScheme;
-  private final String resourcePath;
   private final ByteBuffer buffer;
   private final int[] targetMapOffsets, targetMap;
   private final POS.Tag[] posDict;
 
-  protected BinaryDictionary() throws IOException {
-    this(ResourceScheme.CLASSPATH, null);
-  }
-
-  /**
-   * @param resourceScheme - scheme for loading resources (FILE or CLASSPATH).
-   * @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH
-   *     scheme only, use this class's name as the path.
-   */
-  protected BinaryDictionary(ResourceScheme resourceScheme, String resourcePath)
+  protected BinaryDictionary(
+      IOSupplier<InputStream> targetMapResource,
+      IOSupplier<InputStream> posResource,
+      IOSupplier<InputStream> dictResource)
       throws IOException {
-    this.resourceScheme = resourceScheme;
-    if (resourcePath == null) {
-      if (resourceScheme != ResourceScheme.CLASSPATH) {
-        throw new IllegalArgumentException(
-            "resourcePath must be supplied with FILE resource scheme");
-      }
-      this.resourcePath = getClass().getSimpleName();
-    } else {
-      if (resourceScheme == ResourceScheme.CLASSPATH && !resourcePath.startsWith("/")) {
-        resourcePath = "/".concat(resourcePath);
-      }
-      this.resourcePath = resourcePath;
-    }
-    int[] targetMapOffsets, targetMap;
-    ByteBuffer buffer;
-    try (InputStream mapIS = new BufferedInputStream(getResource(TARGETMAP_FILENAME_SUFFIX));
-        InputStream posIS = new BufferedInputStream(getResource(POSDICT_FILENAME_SUFFIX));
-        // no buffering here, as we load in one large buffer
-        InputStream dictIS = getResource(DICT_FILENAME_SUFFIX)) {
+    try (InputStream mapIS = new BufferedInputStream(targetMapResource.get())) {
       DataInput in = new InputStreamDataInput(mapIS);
       CodecUtil.checkHeader(in, TARGETMAP_HEADER, VERSION, VERSION);
-      targetMap = new int[in.readVInt()];
-      targetMapOffsets = new int[in.readVInt()];
-      int accum = 0, sourceId = 0;
-      for (int ofs = 0; ofs < targetMap.length; ofs++) {
-        final int val = in.readVInt();
-        if ((val & 0x01) != 0) {
-          targetMapOffsets[sourceId] = ofs;
-          sourceId++;
-        }
-        accum += val >>> 1;
-        targetMap[ofs] = accum;
-      }
-      if (sourceId + 1 != targetMapOffsets.length)
-        throw new IOException(
-            "targetMap file format broken; targetMap.length="
-                + targetMap.length
-                + ", targetMapOffsets.length="
-                + targetMapOffsets.length
-                + ", sourceId="
-                + sourceId);
-      targetMapOffsets[sourceId] = targetMap.length;
+      this.targetMap = new int[in.readVInt()];
+      this.targetMapOffsets = new int[in.readVInt()];
+      populateTargetMap(in, this.targetMap, this.targetMapOffsets);
+    }
 
-      in = new InputStreamDataInput(posIS);
+    try (InputStream posIS = new BufferedInputStream(posResource.get())) {
+      DataInput in = new InputStreamDataInput(posIS);
       CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION);
       int posSize = in.readVInt();
-      posDict = new POS.Tag[posSize];
+      this.posDict = new POS.Tag[posSize];
       for (int j = 0; j < posSize; j++) {
         posDict[j] = POS.resolveTag(in.readByte());
       }
+    }
 
-      in = new InputStreamDataInput(dictIS);
+    // no buffering here, as we load in one large buffer
+    try (InputStream dictIS = dictResource.get()) {
+      DataInput in = new InputStreamDataInput(dictIS);
       CodecUtil.checkHeader(in, DICT_HEADER, VERSION, VERSION);
       final int size = in.readVInt();
       final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
@@ -128,48 +87,31 @@ public abstract class BinaryDictionary implements Dictionary {
       if (read != size) {
         throw new EOFException("Cannot read whole dictionary");
       }
-      buffer = tmpBuffer.asReadOnlyBuffer();
-    }
-
-    this.targetMap = targetMap;
-    this.targetMapOffsets = targetMapOffsets;
-    this.buffer = buffer;
-  }
-
-  protected final InputStream getResource(String suffix) throws IOException {
-    switch (resourceScheme) {
-      case CLASSPATH:
-        return getClassResource(resourcePath + suffix);
-      case FILE:
-        return Files.newInputStream(Paths.get(resourcePath + suffix));
-      default:
-        throw new IllegalStateException("unknown resource scheme " + resourceScheme);
+      this.buffer = tmpBuffer.asReadOnlyBuffer();
     }
   }
 
-  public static InputStream getResource(ResourceScheme scheme, String path) throws IOException {
-    switch (scheme) {
-      case CLASSPATH:
-        return getClassResource(path);
-      case FILE:
-        return Files.newInputStream(Paths.get(path));
-      default:
-        throw new IllegalStateException("unknown resource scheme " + scheme);
+  private static void populateTargetMap(DataInput in, int[] targetMap, int[] targetMapOffsets)
+      throws IOException {
+    int accum = 0, sourceId = 0;
+    for (int ofs = 0; ofs < targetMap.length; ofs++) {
+      final int val = in.readVInt();
+      if ((val & 0x01) != 0) {
+        targetMapOffsets[sourceId] = ofs;
+        sourceId++;
+      }
+      accum += val >>> 1;
+      targetMap[ofs] = accum;
     }
-  }
-
-  // util, reused by ConnectionCosts and CharacterDefinition
-  public static InputStream getClassResource(Class<?> clazz, String suffix) throws IOException {
-    final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix);
-    if (is == null) {
-      throw new FileNotFoundException(
-          "Not in classpath: " + clazz.getName().replace('.', '/') + suffix);
-    }
-    return is;
-  }
-
-  private static InputStream getClassResource(String path) throws IOException {
-    return IOUtils.requireResourceNonNull(BinaryDictionary.class.getResourceAsStream(path), path);
+    if (sourceId + 1 != targetMapOffsets.length)
+      throw new IOException(
+          "targetMap file format broken; targetMap.length="
+              + targetMap.length
+              + ", targetMapOffsets.length="
+              + targetMapOffsets.length
+              + ", sourceId="
+              + sourceId);
+    targetMapOffsets[sourceId] = targetMap.length;
   }
 
   public void lookupWordIds(int sourceId, IntsRef ref) {
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/CharacterDefinition.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/CharacterDefinition.java
index 9ca0b814027..5e2e48932ba 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/CharacterDefinition.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/CharacterDefinition.java
@@ -73,11 +73,7 @@ public final class CharacterDefinition {
   public static final byte HANJANUMERIC = (byte) CharacterClass.HANJANUMERIC.ordinal();
 
   private CharacterDefinition() throws IOException {
-    InputStream is = null;
-    boolean success = false;
-    try {
-      is = BinaryDictionary.getClassResource(getClass(), FILENAME_SUFFIX);
-      is = new BufferedInputStream(is);
+    try (InputStream is = new BufferedInputStream(getClassResource())) {
       final DataInput in = new InputStreamDataInput(is);
       CodecUtil.checkHeader(in, HEADER, VERSION, VERSION);
       in.readBytes(characterCategoryMap, 0, characterCategoryMap.length);
@@ -86,16 +82,15 @@ public final class CharacterDefinition {
         invokeMap[i] = (b & 0x01) != 0;
         groupMap[i] = (b & 0x02) != 0;
       }
-      success = true;
-    } finally {
-      if (success) {
-        IOUtils.close(is);
-      } else {
-        IOUtils.closeWhileHandlingException(is);
-      }
     }
   }
 
+  private static InputStream getClassResource() throws IOException {
+    final String resourcePath = CharacterDefinition.class.getSimpleName() + FILENAME_SUFFIX;
+    return IOUtils.requireResourceNonNull(
+        CharacterDefinition.class.getResourceAsStream(resourcePath), resourcePath);
+  }
+
   public byte getCharacterClass(char c) {
     return characterCategoryMap[c];
   }
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/ConnectionCosts.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/ConnectionCosts.java
index 896c3795271..41dc4aa713a 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/ConnectionCosts.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/ConnectionCosts.java
@@ -20,9 +20,14 @@ import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.InputStreamDataInput;
+import org.apache.lucene.util.IOSupplier;
+import org.apache.lucene.util.IOUtils;
 
 /** n-gram connection cost data */
 public final class ConnectionCosts {
@@ -38,12 +43,32 @@ public final class ConnectionCosts {
    * @param scheme - scheme for loading resources (FILE or CLASSPATH).
    * @param resourcePath - where to load resources from, without the ".dat" suffix
    */
+  @Deprecated(forRemoval = true, since = "9.1")
+  @SuppressWarnings("removal")
   public ConnectionCosts(BinaryDictionary.ResourceScheme scheme, String resourcePath)
       throws IOException {
-    try (InputStream is =
-        new BufferedInputStream(
-            BinaryDictionary.getResource(
-                scheme, "/" + resourcePath.replace('.', '/') + FILENAME_SUFFIX))) {
+    this(
+        scheme == BinaryDictionary.ResourceScheme.FILE
+            ? () -> Files.newInputStream(Paths.get(resourcePath + FILENAME_SUFFIX))
+            : ConnectionCosts::getClassResource);
+  }
+
+  /**
+   * Create a {@link ConnectionCosts} from an external resource path.
+   *
+   * @param connectionCostsFile where to load connection costs resource
+   * @throws IOException if resource was not found or broken
+   */
+  public ConnectionCosts(Path connectionCostsFile) throws IOException {
+    this(() -> Files.newInputStream(connectionCostsFile));
+  }
+
+  private ConnectionCosts() throws IOException {
+    this(ConnectionCosts::getClassResource);
+  }
+
+  private ConnectionCosts(IOSupplier<InputStream> connectionCostResource) throws IOException {
+    try (InputStream is = new BufferedInputStream(connectionCostResource.get())) {
       final DataInput in = new InputStreamDataInput(is);
       CodecUtil.checkHeader(in, HEADER, VERSION, VERSION);
       this.forwardSize = in.readVInt();
@@ -63,8 +88,10 @@ public final class ConnectionCosts {
     }
   }
 
-  private ConnectionCosts() throws IOException {
-    this(BinaryDictionary.ResourceScheme.CLASSPATH, ConnectionCosts.class.getName());
+  private static InputStream getClassResource() throws IOException {
+    final String resourcePath = ConnectionCosts.class.getSimpleName() + FILENAME_SUFFIX;
+    return IOUtils.requireResourceNonNull(
+        ConnectionCosts.class.getResourceAsStream(resourcePath), resourcePath);
   }
 
   public int get(int forwardId, int backwardId) {
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java
index 35d3d1b708d..c5182a5123b 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java
@@ -19,8 +19,13 @@ package org.apache.lucene.analysis.ko.dict;
 import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.InputStreamDataInput;
+import org.apache.lucene.util.IOSupplier;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
 
@@ -35,7 +40,11 @@ public final class TokenInfoDictionary extends BinaryDictionary {
   private final TokenInfoFST fst;
 
   private TokenInfoDictionary() throws IOException {
-    this(ResourceScheme.CLASSPATH, null);
+    this(
+        () -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
+        () -> getClassResource(POSDICT_FILENAME_SUFFIX),
+        () -> getClassResource(DICT_FILENAME_SUFFIX),
+        () -> getClassResource(FST_FILENAME_SUFFIX));
   }
 
   /**
@@ -43,17 +52,64 @@ public final class TokenInfoDictionary extends BinaryDictionary {
    * @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH
    *     scheme only, use this class's name as the path.
    */
+  @Deprecated(forRemoval = true, since = "9.1")
+  @SuppressWarnings("removal")
   public TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath)
       throws IOException {
-    super(resourceScheme, resourcePath);
+    this(
+        resourceScheme == ResourceScheme.FILE
+            ? () -> Files.newInputStream(Paths.get(resourcePath + TARGETMAP_FILENAME_SUFFIX))
+            : () -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
+        resourceScheme == ResourceScheme.FILE
+            ? () -> Files.newInputStream(Paths.get(resourcePath + POSDICT_FILENAME_SUFFIX))
+            : () -> getClassResource(POSDICT_FILENAME_SUFFIX),
+        resourceScheme == ResourceScheme.FILE
+            ? () -> Files.newInputStream(Paths.get(resourcePath + DICT_FILENAME_SUFFIX))
+            : () -> getClassResource(DICT_FILENAME_SUFFIX),
+        resourceScheme == ResourceScheme.FILE
+            ? () -> Files.newInputStream(Paths.get(resourcePath + FST_FILENAME_SUFFIX))
+            : () -> getClassResource(FST_FILENAME_SUFFIX));
+  }
+
+  /**
+   * Create a {@link TokenInfoDictionary} from an external resource path.
+   *
+   * @param targetMapFile where to load target map resource
+   * @param posDictFile where to load POS dictionary resource
+   * @param dictFile where to load dictionary entries resource
+   * @param fstFile where to load encoded FST data resource
+   * @throws IOException if resource was not found or broken
+   */
+  public TokenInfoDictionary(Path targetMapFile, Path posDictFile, Path dictFile, Path fstFile)
+      throws IOException {
+    this(
+        () -> Files.newInputStream(targetMapFile),
+        () -> Files.newInputStream(posDictFile),
+        () -> Files.newInputStream(dictFile),
+        () -> Files.newInputStream(fstFile));
+  }
+
+  private TokenInfoDictionary(
+      IOSupplier<InputStream> targetMapResource,
+      IOSupplier<InputStream> posResource,
+      IOSupplier<InputStream> dictResource,
+      IOSupplier<InputStream> fstResource)
+      throws IOException {
+    super(targetMapResource, posResource, dictResource);
     FST<Long> fst;
-    try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {
+    try (InputStream is = new BufferedInputStream(fstResource.get())) {
       DataInput in = new InputStreamDataInput(is);
       fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
     }
     this.fst = new TokenInfoFST(fst);
   }
 
+  private static InputStream getClassResource(String suffix) throws IOException {
+    final String resourcePath = TokenInfoDictionary.class.getSimpleName() + suffix;
+    return IOUtils.requireResourceNonNull(
+        TokenInfoDictionary.class.getResourceAsStream(resourcePath), resourcePath);
+  }
+
   public TokenInfoFST getFST() {
     return fst;
   }
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownDictionary.java
index a804890f34e..10eb1611b82 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownDictionary.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownDictionary.java
@@ -17,6 +17,11 @@
 package org.apache.lucene.analysis.ko.dict;
 
 import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import org.apache.lucene.util.IOUtils;
 
 /** Dictionary for unknown-word handling. */
 public final class UnknownDictionary extends BinaryDictionary {
@@ -27,12 +32,47 @@ public final class UnknownDictionary extends BinaryDictionary {
    * @param resourcePath where to load resources from; a path, including the file base name without
    *     extension; this is used to match multiple files with the same base name.
    */
+  @Deprecated(forRemoval = true, since = "9.1")
+  @SuppressWarnings("removal")
   public UnknownDictionary(ResourceScheme scheme, String resourcePath) throws IOException {
-    super(scheme, resourcePath);
+    super(
+        scheme == ResourceScheme.FILE
+            ? () -> Files.newInputStream(Paths.get(resourcePath + TARGETMAP_FILENAME_SUFFIX))
+            : () -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
+        scheme == ResourceScheme.FILE
+            ? () -> Files.newInputStream(Paths.get(resourcePath + POSDICT_FILENAME_SUFFIX))
+            : () -> getClassResource(POSDICT_FILENAME_SUFFIX),
+        scheme == ResourceScheme.FILE
+            ? () -> Files.newInputStream(Paths.get(resourcePath + DICT_FILENAME_SUFFIX))
+            : () -> getClassResource(DICT_FILENAME_SUFFIX));
+  }
+
+  /**
+   * Create a {@link UnknownDictionary} from an external resource path.
+   *
+   * @param targetMapFile where to load target map resource
+   * @param posDictFile where to load POS dictionary resource
+   * @param dictFile where to load dictionary entries resource
+   * @throws IOException if resource was not found or broken
+   */
+  public UnknownDictionary(Path targetMapFile, Path posDictFile, Path dictFile) throws IOException {
+    super(
+        () -> Files.newInputStream(targetMapFile),
+        () -> Files.newInputStream(posDictFile),
+        () -> Files.newInputStream(dictFile));
   }
 
   private UnknownDictionary() throws IOException {
-    super();
+    super(
+        () -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
+        () -> getClassResource(POSDICT_FILENAME_SUFFIX),
+        () -> getClassResource(DICT_FILENAME_SUFFIX));
+  }
+
+  private static InputStream getClassResource(String suffix) throws IOException {
+    final String resourcePath = UnknownDictionary.class.getSimpleName() + suffix;
+    return IOUtils.requireResourceNonNull(
+        UnknownDictionary.class.getResourceAsStream(resourcePath), resourcePath);
   }
 
   public CharacterDefinition getCharacterDefinition() {
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
index ec72e8a089b..315c876435c 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
@@ -473,6 +473,7 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
   }
 
   // Make sure loading custom dictionaries from classpath works:
+  @SuppressWarnings("removal")
   public void testCustomDictionary() throws Exception {
     Tokenizer tokenizer =
         new KoreanTokenizer(
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestExternalDictionary.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestExternalDictionary.java
new file mode 100644
index 00000000000..5f8edab8934
--- /dev/null
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestExternalDictionary.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ko.dict;
+
+import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.DICT_FILENAME_SUFFIX;
+import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.POSDICT_FILENAME_SUFFIX;
+import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.TARGETMAP_FILENAME_SUFFIX;
+import static org.apache.lucene.analysis.ko.dict.TokenInfoDictionary.FST_FILENAME_SUFFIX;
+
+import java.io.BufferedWriter;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import org.apache.lucene.analysis.ko.util.DictionaryBuilder;
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.junit.Before;
+
+public class TestExternalDictionary extends LuceneTestCase {
+
+  private Path dir;
+
+  @Override
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+    dir = createTempDir("systemDict");
+    try (BufferedWriter writer =
+        Files.newBufferedWriter(dir.resolve("unk.def"), StandardCharsets.UTF_8)) {
+      writer.write("DEFAULT,1798,3559,3677,SY,*,*,*,*,*,*,*");
+      writer.newLine();
+      writer.write("SPACE,1795,3556,1065,SP,*,*,*,*,*,*,*");
+      writer.newLine();
+    }
+    try (BufferedWriter writer =
+        Files.newBufferedWriter(dir.resolve("char.def"), StandardCharsets.UTF_8)) {
+      writer.write("0x0021..0x002F SYMBOL");
+      writer.newLine();
+      writer.write("0x0030..0x0039 NUMERIC");
+      writer.newLine();
+    }
+    try (BufferedWriter writer =
+        Files.newBufferedWriter(dir.resolve("matrix.def"), StandardCharsets.UTF_8)) {
+      writer.write("3 3");
+      writer.newLine();
+      writer.write("1 1 0");
+      writer.newLine();
+      writer.write("1 2 0");
+      writer.newLine();
+    }
+    try (BufferedWriter writer =
+        Files.newBufferedWriter(dir.resolve("noun.csv"), StandardCharsets.UTF_8)) {
+      writer.write("명사,1,1,2,NNG,*,*,*,*,*,*,*");
+      writer.newLine();
+      writer.write("일반,5000,5000,3,NNG,*,*,*,*,*,*,*");
+      writer.newLine();
+    }
+    DictionaryBuilder.build(dir, dir, "utf-8", true);
+  }
+
+  public void testLoadExternalTokenInfoDictionary() throws Exception {
+    String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', '/');
+    TokenInfoDictionary dict =
+        new TokenInfoDictionary(
+            dir.resolve(dictionaryPath + TARGETMAP_FILENAME_SUFFIX),
+            dir.resolve(dictionaryPath + POSDICT_FILENAME_SUFFIX),
+            dir.resolve(dictionaryPath + DICT_FILENAME_SUFFIX),
+            dir.resolve(dictionaryPath + FST_FILENAME_SUFFIX));
+    assertNotNull(dict.getFST());
+  }
+
+  public void testLoadExternalUnknownDictionary() throws Exception {
+    String dictionaryPath = UnknownDictionary.class.getName().replace('.', '/');
+    UnknownDictionary dict =
+        new UnknownDictionary(
+            dir.resolve(dictionaryPath + TARGETMAP_FILENAME_SUFFIX),
+            dir.resolve(dictionaryPath + POSDICT_FILENAME_SUFFIX),
+            dir.resolve(dictionaryPath + DICT_FILENAME_SUFFIX));
+    assertNotNull(dict.getCharacterDefinition());
+  }
+
+  public void testLoadExternalConnectionCosts() throws Exception {
+    String dictionaryPath = ConnectionCosts.class.getName().replace('.', '/');
+    ConnectionCosts cc =
+        new ConnectionCosts(dir.resolve(dictionaryPath + ConnectionCosts.FILENAME_SUFFIX));
+    assertEquals(0, cc.get(1, 1));
+  }
+}
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java
index 406ce6c414e..39fc55065d9 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java
@@ -16,7 +16,10 @@
  */
 package org.apache.lucene.analysis.ko.dict;
 
-import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.ResourceScheme;
+import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.DICT_FILENAME_SUFFIX;
+import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.POSDICT_FILENAME_SUFFIX;
+import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.TARGETMAP_FILENAME_SUFFIX;
+import static org.apache.lucene.analysis.ko.dict.TokenInfoDictionary.FST_FILENAME_SUFFIX;
 
 import java.io.OutputStream;
 import java.io.OutputStreamWriter;
@@ -76,7 +79,11 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
     DictionaryBuilder.build(dir, dir, "utf-8", true);
     String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', '/');
     // We must also load the other files (in BinaryDictionary) from the correct path
-    return new TokenInfoDictionary(ResourceScheme.FILE, dir.resolve(dictionaryPath).toString());
+    return new TokenInfoDictionary(
+        dir.resolve(dictionaryPath + TARGETMAP_FILENAME_SUFFIX),
+        dir.resolve(dictionaryPath + POSDICT_FILENAME_SUFFIX),
+        dir.resolve(dictionaryPath + DICT_FILENAME_SUFFIX),
+        dir.resolve(dictionaryPath + FST_FILENAME_SUFFIX));
   }
 
   public void testPutException() {
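
Usage sketch (illustrative, not part of the patch): the snippet below shows one way the new Path-based constructors introduced above might be called from application code, assuming a directory produced by DictionaryBuilder whose binary files are named after each dictionary class, mirroring TestExternalDictionary. The class name, variable names, and command-line handling are hypothetical; only the constructor signatures and the *_FILENAME_SUFFIX constants come from this change.

import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.analysis.ko.dict.BinaryDictionary;
import org.apache.lucene.analysis.ko.dict.ConnectionCosts;
import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.ko.dict.UnknownDictionary;

/** Hypothetical example: load Nori dictionary data from an external directory. */
public class LoadExternalNoriDictionaries {

  public static void main(String[] args) throws Exception {
    // Assumption: args[0] is the output directory of DictionaryBuilder.build(...),
    // which names each binary file after the dictionary class it belongs to.
    Path dictDir = Paths.get(args[0]);

    // Known-word dictionary: target map, POS dictionary, entry buffer, and FST.
    String tokenInfoBase = TokenInfoDictionary.class.getName().replace('.', '/');
    TokenInfoDictionary tokenInfo =
        new TokenInfoDictionary(
            dictDir.resolve(tokenInfoBase + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX),
            dictDir.resolve(tokenInfoBase + BinaryDictionary.POSDICT_FILENAME_SUFFIX),
            dictDir.resolve(tokenInfoBase + BinaryDictionary.DICT_FILENAME_SUFFIX),
            dictDir.resolve(tokenInfoBase + TokenInfoDictionary.FST_FILENAME_SUFFIX));

    // Unknown-word dictionary: same three binary files, but no FST.
    String unkBase = UnknownDictionary.class.getName().replace('.', '/');
    UnknownDictionary unk =
        new UnknownDictionary(
            dictDir.resolve(unkBase + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX),
            dictDir.resolve(unkBase + BinaryDictionary.POSDICT_FILENAME_SUFFIX),
            dictDir.resolve(unkBase + BinaryDictionary.DICT_FILENAME_SUFFIX));

    // Connection costs: a single file ending in ConnectionCosts.FILENAME_SUFFIX.
    String ccBase = ConnectionCosts.class.getName().replace('.', '/');
    ConnectionCosts costs =
        new ConnectionCosts(dictDir.resolve(ccBase + ConnectionCosts.FILENAME_SUFFIX));

    System.out.println(
        "loaded: fst=" + tokenInfo.getFST()
            + ", charDef=" + unk.getCharacterDefinition()
            + ", cost(1,1)=" + costs.get(1, 1));
  }
}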