mirror of https://github.com/apache/lucene.git
LUCENE-10400: revise binary dictionaries' constructor in kuromoji (#643)
This commit is contained in:
parent
e93b08f471
commit
e7546c2427
|
@ -75,6 +75,11 @@ API Changes
|
|||
* LUCENE-10368: IntTaxonomyFacets has been deprecated and is no longer a supported extension point
|
||||
for user-created faceting implementations. (Greg Miller)
|
||||
|
||||
* LUCENE-10400: Add constructors that take external resource Paths to dictionary classes in Kuromoji:
|
||||
ConnectionCosts, TokenInfoDictionary, and UnknownDictionary. Old constructors that take resource scheme and
|
||||
resource path in those classes are deprecated; they are replaced by the new constructors and are planned to be
|
||||
removed in a future release. (Tomoko Uchida, Uwe Schindler, Mike Sokolov)
|
||||
|
||||
* LUCENE-10050: Deprecate DrillSideways#search(Query, Collector) in favor of
|
||||
DrillSideways#search(Query, CollectorManager). This reflects the change (LUCENE-10002) being made in
|
||||
IndexSearcher#search that trends towards using CollectorManagers over Collectors. (Gautam Worah)
|
||||
|
|
|
@ -16,9 +16,10 @@
|
|||
*/
|
||||
package org.apache.lucene.analysis.ja.dict;
|
||||
|
||||
import static org.apache.lucene.util.IOUtils.IOSupplier;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.EOFException;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.ByteBuffer;
|
||||
|
@ -36,6 +37,7 @@ import org.apache.lucene.util.IntsRef;
|
|||
public abstract class BinaryDictionary implements Dictionary {
|
||||
|
||||
/** Used to specify where (dictionary) resources get loaded from. */
|
||||
@Deprecated(forRemoval = true, since = "9.1")
|
||||
public enum ResourceScheme {
|
||||
CLASSPATH,
|
||||
FILE
|
||||
|
@ -50,91 +52,38 @@ public abstract class BinaryDictionary implements Dictionary {
|
|||
public static final String POSDICT_HEADER = "kuromoji_dict_pos";
|
||||
public static final int VERSION = 1;
|
||||
|
||||
private final ResourceScheme resourceScheme;
|
||||
private final String resourcePath;
|
||||
private final ByteBuffer buffer;
|
||||
private final int[] targetMapOffsets, targetMap;
|
||||
private final String[] posDict;
|
||||
private final String[] inflTypeDict;
|
||||
private final String[] inflFormDict;
|
||||
|
||||
protected BinaryDictionary() throws IOException {
|
||||
this(ResourceScheme.CLASSPATH, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param resourceScheme - scheme for loading resources (FILE or CLASSPATH).
|
||||
* @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH
|
||||
* scheme only, use this class's name as the path.
|
||||
*/
|
||||
protected BinaryDictionary(ResourceScheme resourceScheme, String resourcePath)
|
||||
protected BinaryDictionary(
|
||||
IOSupplier<InputStream> targetMapResource,
|
||||
IOSupplier<InputStream> posResource,
|
||||
IOSupplier<InputStream> dictResource)
|
||||
throws IOException {
|
||||
this.resourceScheme = resourceScheme;
|
||||
if (resourcePath == null) {
|
||||
if (resourceScheme != ResourceScheme.CLASSPATH) {
|
||||
throw new IllegalArgumentException(
|
||||
"resourcePath must be supplied with FILE resource scheme");
|
||||
}
|
||||
this.resourcePath = getClass().getSimpleName();
|
||||
} else {
|
||||
if (resourceScheme == ResourceScheme.CLASSPATH && !resourcePath.startsWith("/")) {
|
||||
resourcePath = "/".concat(resourcePath);
|
||||
}
|
||||
this.resourcePath = resourcePath;
|
||||
}
|
||||
int[] targetMapOffsets = null, targetMap = null;
|
||||
String[] posDict = null;
|
||||
String[] inflFormDict = null;
|
||||
String[] inflTypeDict = null;
|
||||
ByteBuffer buffer = null;
|
||||
try (InputStream mapIS = new BufferedInputStream(getResource(TARGETMAP_FILENAME_SUFFIX));
|
||||
InputStream posIS = new BufferedInputStream(getResource(POSDICT_FILENAME_SUFFIX));
|
||||
// no buffering here, as we load in one large buffer
|
||||
InputStream dictIS = getResource(DICT_FILENAME_SUFFIX)) {
|
||||
DataInput in = new InputStreamDataInput(mapIS);
|
||||
try (InputStream mapIS = new BufferedInputStream(targetMapResource.get())) {
|
||||
final DataInput in = new InputStreamDataInput(mapIS);
|
||||
CodecUtil.checkHeader(in, TARGETMAP_HEADER, VERSION, VERSION);
|
||||
targetMap = new int[in.readVInt()];
|
||||
targetMapOffsets = new int[in.readVInt()];
|
||||
int accum = 0, sourceId = 0;
|
||||
for (int ofs = 0; ofs < targetMap.length; ofs++) {
|
||||
final int val = in.readVInt();
|
||||
if ((val & 0x01) != 0) {
|
||||
targetMapOffsets[sourceId] = ofs;
|
||||
sourceId++;
|
||||
}
|
||||
accum += val >>> 1;
|
||||
targetMap[ofs] = accum;
|
||||
}
|
||||
if (sourceId + 1 != targetMapOffsets.length)
|
||||
throw new IOException(
|
||||
"targetMap file format broken; targetMap.length="
|
||||
+ targetMap.length
|
||||
+ ", targetMapOffsets.length="
|
||||
+ targetMapOffsets.length
|
||||
+ ", sourceId="
|
||||
+ sourceId);
|
||||
targetMapOffsets[sourceId] = targetMap.length;
|
||||
this.targetMap = new int[in.readVInt()];
|
||||
this.targetMapOffsets = new int[in.readVInt()];
|
||||
populateTargetMap(in, this.targetMap, this.targetMapOffsets);
|
||||
}
|
||||
|
||||
in = new InputStreamDataInput(posIS);
|
||||
try (InputStream posIS = new BufferedInputStream(posResource.get())) {
|
||||
final DataInput in = new InputStreamDataInput(posIS);
|
||||
CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION);
|
||||
int posSize = in.readVInt();
|
||||
posDict = new String[posSize];
|
||||
inflTypeDict = new String[posSize];
|
||||
inflFormDict = new String[posSize];
|
||||
for (int j = 0; j < posSize; j++) {
|
||||
posDict[j] = in.readString();
|
||||
inflTypeDict[j] = in.readString();
|
||||
inflFormDict[j] = in.readString();
|
||||
// this is how we encode null inflections
|
||||
if (inflTypeDict[j].length() == 0) {
|
||||
inflTypeDict[j] = null;
|
||||
}
|
||||
if (inflFormDict[j].length() == 0) {
|
||||
inflFormDict[j] = null;
|
||||
}
|
||||
}
|
||||
final int posSize = in.readVInt();
|
||||
this.posDict = new String[posSize];
|
||||
this.inflTypeDict = new String[posSize];
|
||||
this.inflFormDict = new String[posSize];
|
||||
populatePosDict(in, posSize, this.posDict, this.inflTypeDict, this.inflFormDict);
|
||||
}
|
||||
|
||||
in = new InputStreamDataInput(dictIS);
|
||||
// no buffering here, as we load in one large buffer
|
||||
try (InputStream dictIS = dictResource.get()) {
|
||||
final DataInput in = new InputStreamDataInput(dictIS);
|
||||
CodecUtil.checkHeader(in, DICT_HEADER, VERSION, VERSION);
|
||||
final int size = in.readVInt();
|
||||
final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
|
||||
|
@ -143,28 +92,51 @@ public abstract class BinaryDictionary implements Dictionary {
|
|||
if (read != size) {
|
||||
throw new EOFException("Cannot read whole dictionary");
|
||||
}
|
||||
buffer = tmpBuffer.asReadOnlyBuffer();
|
||||
}
|
||||
|
||||
this.targetMap = targetMap;
|
||||
this.targetMapOffsets = targetMapOffsets;
|
||||
this.posDict = posDict;
|
||||
this.inflTypeDict = inflTypeDict;
|
||||
this.inflFormDict = inflFormDict;
|
||||
this.buffer = buffer;
|
||||
}
|
||||
|
||||
protected final InputStream getResource(String suffix) throws IOException {
|
||||
switch (resourceScheme) {
|
||||
case CLASSPATH:
|
||||
return getClassResource(resourcePath + suffix);
|
||||
case FILE:
|
||||
return Files.newInputStream(Paths.get(resourcePath + suffix));
|
||||
default:
|
||||
throw new IllegalStateException("unknown resource scheme " + resourceScheme);
|
||||
this.buffer = tmpBuffer.asReadOnlyBuffer();
|
||||
}
|
||||
}
|
||||
|
||||
private static void populateTargetMap(DataInput in, int[] targetMap, int[] targetMapOffsets)
|
||||
throws IOException {
|
||||
int accum = 0, sourceId = 0;
|
||||
for (int ofs = 0; ofs < targetMap.length; ofs++) {
|
||||
final int val = in.readVInt();
|
||||
if ((val & 0x01) != 0) {
|
||||
targetMapOffsets[sourceId] = ofs;
|
||||
sourceId++;
|
||||
}
|
||||
accum += val >>> 1;
|
||||
targetMap[ofs] = accum;
|
||||
}
|
||||
if (sourceId + 1 != targetMapOffsets.length)
|
||||
throw new IOException(
|
||||
"targetMap file format broken; targetMap.length="
|
||||
+ targetMap.length
|
||||
+ ", targetMapOffsets.length="
|
||||
+ targetMapOffsets.length
|
||||
+ ", sourceId="
|
||||
+ sourceId);
|
||||
targetMapOffsets[sourceId] = targetMap.length;
|
||||
}
|
||||
|
||||
private static void populatePosDict(
|
||||
DataInput in, int posSize, String[] posDict, String[] inflTypeDict, String[] inflFormDict)
|
||||
throws IOException {
|
||||
for (int j = 0; j < posSize; j++) {
|
||||
posDict[j] = in.readString();
|
||||
inflTypeDict[j] = in.readString();
|
||||
inflFormDict[j] = in.readString();
|
||||
// this is how we encode null inflections
|
||||
if (inflTypeDict[j].length() == 0) {
|
||||
inflTypeDict[j] = null;
|
||||
}
|
||||
if (inflFormDict[j].length() == 0) {
|
||||
inflFormDict[j] = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Deprecated(forRemoval = true, since = "9.1")
|
||||
public static final InputStream getResource(ResourceScheme scheme, String path)
|
||||
throws IOException {
|
||||
switch (scheme) {
|
||||
|
@ -177,17 +149,7 @@ public abstract class BinaryDictionary implements Dictionary {
|
|||
}
|
||||
}
|
||||
|
||||
// util, reused by ConnectionCosts and CharacterDefinition
|
||||
public static final InputStream getClassResource(Class<?> clazz, String suffix)
|
||||
throws IOException {
|
||||
final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix);
|
||||
if (is == null) {
|
||||
throw new FileNotFoundException(
|
||||
"Not in classpath: " + clazz.getName().replace('.', '/') + suffix);
|
||||
}
|
||||
return is;
|
||||
}
|
||||
|
||||
@Deprecated(forRemoval = true, since = "9.1")
|
||||
private static InputStream getClassResource(String path) throws IOException {
|
||||
return IOUtils.requireResourceNonNull(BinaryDictionary.class.getResourceAsStream(path), path);
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.io.InputStream;
|
|||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.InputStreamDataInput;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
/** Character category data. */
|
||||
public final class CharacterDefinition {
|
||||
|
@ -68,8 +69,7 @@ public final class CharacterDefinition {
|
|||
public static final byte KANJINUMERIC = (byte) CharacterClass.KANJINUMERIC.ordinal();
|
||||
|
||||
private CharacterDefinition() throws IOException {
|
||||
try (InputStream is =
|
||||
new BufferedInputStream(BinaryDictionary.getClassResource(getClass(), FILENAME_SUFFIX))) {
|
||||
try (InputStream is = new BufferedInputStream(getClassResource())) {
|
||||
final DataInput in = new InputStreamDataInput(is);
|
||||
CodecUtil.checkHeader(in, HEADER, VERSION, VERSION);
|
||||
in.readBytes(characterCategoryMap, 0, characterCategoryMap.length);
|
||||
|
@ -81,6 +81,12 @@ public final class CharacterDefinition {
|
|||
}
|
||||
}
|
||||
|
||||
private static InputStream getClassResource() throws IOException {
|
||||
final String resourcePath = CharacterDefinition.class.getSimpleName() + FILENAME_SUFFIX;
|
||||
return IOUtils.requireResourceNonNull(
|
||||
CharacterDefinition.class.getResourceAsStream(resourcePath), resourcePath);
|
||||
}
|
||||
|
||||
public byte getCharacterClass(char c) {
|
||||
return characterCategoryMap[c];
|
||||
}
|
||||
|
|
|
@ -16,13 +16,19 @@
|
|||
*/
|
||||
package org.apache.lucene.analysis.ja.dict;
|
||||
|
||||
import static org.apache.lucene.util.IOUtils.IOSupplier;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.InputStreamDataInput;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
/** n-gram connection cost data */
|
||||
public final class ConnectionCosts {
|
||||
|
@ -37,11 +43,33 @@ public final class ConnectionCosts {
|
|||
/**
|
||||
* @param scheme - scheme for loading resources (FILE or CLASSPATH).
|
||||
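* @param path - where to load resources from, without the ".dat" suffix. Only honored with the
*     FILE scheme; with CLASSPATH the resource next to this class is loaded and path is ignored.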
|
||||
* @deprecated replaced by {@link #ConnectionCosts(Path)}
|
||||
*/
|
||||
@Deprecated(forRemoval = true, since = "9.1")
|
||||
@SuppressWarnings("removal")
|
||||
public ConnectionCosts(BinaryDictionary.ResourceScheme scheme, String path) throws IOException {
|
||||
try (InputStream is =
|
||||
new BufferedInputStream(
|
||||
BinaryDictionary.getResource(scheme, "/" + path.replace('.', '/') + FILENAME_SUFFIX))) {
|
||||
this(
|
||||
scheme == BinaryDictionary.ResourceScheme.FILE
|
||||
? () -> Files.newInputStream(Paths.get(path + FILENAME_SUFFIX))
|
||||
: ConnectionCosts::getClassResource);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link ConnectionCosts} from a connection costs file given as a {@link Path}.
|
||||
*
|
||||
* @param connectionCostsFile path of the connection costs resource to load
|
||||
* @throws IOException if the resource was not found or is broken
|
||||
*/
|
||||
public ConnectionCosts(Path connectionCostsFile) throws IOException {
|
||||
// Hand over a supplier; the stream-based constructor opens the file when it calls get().
this(() -> Files.newInputStream(connectionCostsFile));
|
||||
}
|
||||
|
||||
// Loads the connection costs from the stream supplied by getClassResource().
private ConnectionCosts() throws IOException {
|
||||
this(ConnectionCosts::getClassResource);
|
||||
}
|
||||
|
||||
private ConnectionCosts(IOSupplier<InputStream> connectionCostResource) throws IOException {
|
||||
try (InputStream is = new BufferedInputStream(connectionCostResource.get())) {
|
||||
final DataInput in = new InputStreamDataInput(is);
|
||||
CodecUtil.checkHeader(in, HEADER, VERSION, VERSION);
|
||||
forwardSize = in.readVInt();
|
||||
|
@ -61,8 +89,10 @@ public final class ConnectionCosts {
|
|||
}
|
||||
}
|
||||
|
||||
private ConnectionCosts() throws IOException {
|
||||
this(BinaryDictionary.ResourceScheme.CLASSPATH, ConnectionCosts.class.getName());
|
||||
private static InputStream getClassResource() throws IOException {
|
||||
final String resourcePath = ConnectionCosts.class.getSimpleName() + FILENAME_SUFFIX;
|
||||
return IOUtils.requireResourceNonNull(
|
||||
ConnectionCosts.class.getResourceAsStream(resourcePath), resourcePath);
|
||||
}
|
||||
|
||||
public int get(int forwardId, int backwardId) {
|
||||
|
|
|
@ -16,11 +16,17 @@
|
|||
*/
|
||||
package org.apache.lucene.analysis.ja.dict;
|
||||
|
||||
import static org.apache.lucene.util.IOUtils.IOSupplier;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.InputStreamDataInput;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||
|
||||
|
@ -38,12 +44,62 @@ public final class TokenInfoDictionary extends BinaryDictionary {
|
|||
* @param resourceScheme - scheme for loading resources (FILE or CLASSPATH).
|
||||
* @param resourcePath - where to load resources (dictionaries) from: the file path without the
*     per-file suffixes. Only honored with the FILE scheme; with CLASSPATH the resources next to
*     this class are always loaded and resourcePath is ignored.
|
||||
* @deprecated replaced by {@link #TokenInfoDictionary(Path, Path, Path, Path)}
|
||||
*/
|
||||
@Deprecated(forRemoval = true, since = "9.1")
|
||||
@SuppressWarnings("removal")
|
||||
public TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath)
|
||||
throws IOException {
|
||||
super(resourceScheme, resourcePath);
|
||||
this(
|
||||
resourceScheme == ResourceScheme.FILE
|
||||
? () -> Files.newInputStream(Paths.get(resourcePath + TARGETMAP_FILENAME_SUFFIX))
|
||||
: () -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
|
||||
resourceScheme == ResourceScheme.FILE
|
||||
? () -> Files.newInputStream(Paths.get(resourcePath + POSDICT_FILENAME_SUFFIX))
|
||||
: () -> getClassResource(POSDICT_FILENAME_SUFFIX),
|
||||
resourceScheme == ResourceScheme.FILE
|
||||
? () -> Files.newInputStream(Paths.get(resourcePath + DICT_FILENAME_SUFFIX))
|
||||
: () -> getClassResource(DICT_FILENAME_SUFFIX),
|
||||
resourceScheme == ResourceScheme.FILE
|
||||
? () -> Files.newInputStream(Paths.get(resourcePath + FST_FILENAME_SUFFIX))
|
||||
: () -> getClassResource(FST_FILENAME_SUFFIX));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link TokenInfoDictionary} from dictionary files given as {@link Path}s.
|
||||
*
|
||||
* @param targetMapFile path of the target map resource to load
|
||||
* @param posDictFile path of the POS dictionary resource to load
|
||||
* @param dictFile path of the dictionary entries resource to load
|
||||
* @param fstFile path of the encoded FST data resource to load
|
||||
* @throws IOException if a resource was not found or is broken
|
||||
*/
|
||||
public TokenInfoDictionary(Path targetMapFile, Path posDictFile, Path dictFile, Path fstFile)
|
||||
throws IOException {
|
||||
// Hand over one supplier per file; each file is opened only when its supplier's get() is called.
this(
|
||||
() -> Files.newInputStream(targetMapFile),
|
||||
() -> Files.newInputStream(posDictFile),
|
||||
() -> Files.newInputStream(dictFile),
|
||||
() -> Files.newInputStream(fstFile));
|
||||
}
|
||||
|
||||
// Loads all four dictionary parts from the streams supplied by getClassResource(suffix).
private TokenInfoDictionary() throws IOException {
|
||||
this(
|
||||
() -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
|
||||
() -> getClassResource(POSDICT_FILENAME_SUFFIX),
|
||||
() -> getClassResource(DICT_FILENAME_SUFFIX),
|
||||
() -> getClassResource(FST_FILENAME_SUFFIX));
|
||||
}
|
||||
|
||||
private TokenInfoDictionary(
|
||||
IOSupplier<InputStream> targetMapResource,
|
||||
IOSupplier<InputStream> posResource,
|
||||
IOSupplier<InputStream> dictResource,
|
||||
IOSupplier<InputStream> fstResource)
|
||||
throws IOException {
|
||||
super(targetMapResource, posResource, dictResource);
|
||||
FST<Long> fst;
|
||||
try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {
|
||||
try (InputStream is = new BufferedInputStream(fstResource.get())) {
|
||||
DataInput in = new InputStreamDataInput(is);
|
||||
fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
|
||||
}
|
||||
|
@ -51,8 +107,10 @@ public final class TokenInfoDictionary extends BinaryDictionary {
|
|||
this.fst = new TokenInfoFST(fst, true);
|
||||
}
|
||||
|
||||
private TokenInfoDictionary() throws IOException {
|
||||
this(ResourceScheme.CLASSPATH, null);
|
||||
private static InputStream getClassResource(String suffix) throws IOException {
|
||||
final String resourcePath = TokenInfoDictionary.class.getSimpleName() + suffix;
|
||||
return IOUtils.requireResourceNonNull(
|
||||
TokenInfoDictionary.class.getResourceAsStream(resourcePath), resourcePath);
|
||||
}
|
||||
|
||||
public TokenInfoFST getFST() {
|
||||
|
|
|
@ -17,6 +17,11 @@
|
|||
package org.apache.lucene.analysis.ja.dict;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
/** Dictionary for unknown-word handling. */
|
||||
public final class UnknownDictionary extends BinaryDictionary {
|
||||
|
@ -27,13 +32,49 @@ public final class UnknownDictionary extends BinaryDictionary {
|
|||
* @param scheme scheme for loading resources (FILE or CLASSPATH).
|
||||
* @param path where to load resources from; a path, including the file base name without
*     extension; this is used to match multiple files with the same base name. Only honored with
*     the FILE scheme; with CLASSPATH the resources next to this class are always loaded.
|
||||
* @deprecated replaced by {@link #UnknownDictionary(Path, Path, Path)}
|
||||
*/
|
||||
@Deprecated(forRemoval = true, since = "9.1")
|
||||
@SuppressWarnings("removal")
|
||||
public UnknownDictionary(ResourceScheme scheme, String path) throws IOException {
|
||||
super(scheme, path);
|
||||
super(
|
||||
scheme == ResourceScheme.FILE
|
||||
? () -> Files.newInputStream(Paths.get(path + TARGETMAP_FILENAME_SUFFIX))
|
||||
: () -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
|
||||
scheme == ResourceScheme.FILE
|
||||
? () -> Files.newInputStream(Paths.get(path + POSDICT_FILENAME_SUFFIX))
|
||||
: () -> getClassResource(POSDICT_FILENAME_SUFFIX),
|
||||
scheme == ResourceScheme.FILE
|
||||
? () -> Files.newInputStream(Paths.get(path + DICT_FILENAME_SUFFIX))
|
||||
: () -> getClassResource(DICT_FILENAME_SUFFIX));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an {@link UnknownDictionary} from dictionary files given as {@link Path}s.
|
||||
*
|
||||
* @param targetMapFile path of the target map resource to load
|
||||
* @param posDictFile path of the POS dictionary resource to load
|
||||
* @param dictFile path of the dictionary entries resource to load
|
||||
* @throws IOException if a resource was not found or is broken
|
||||
*/
|
||||
public UnknownDictionary(Path targetMapFile, Path posDictFile, Path dictFile) throws IOException {
|
||||
// Hand over one supplier per file; each file is opened only when its supplier's get() is called.
super(
|
||||
() -> Files.newInputStream(targetMapFile),
|
||||
() -> Files.newInputStream(posDictFile),
|
||||
() -> Files.newInputStream(dictFile));
|
||||
}
|
||||
|
||||
private UnknownDictionary() throws IOException {
|
||||
super();
|
||||
super(
|
||||
() -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
|
||||
() -> getClassResource(POSDICT_FILENAME_SUFFIX),
|
||||
() -> getClassResource(DICT_FILENAME_SUFFIX));
|
||||
}
|
||||
|
||||
private static InputStream getClassResource(String suffix) throws IOException {
|
||||
final String resourcePath = UnknownDictionary.class.getSimpleName() + suffix;
|
||||
return IOUtils.requireResourceNonNull(
|
||||
UnknownDictionary.class.getResourceAsStream(resourcePath), resourcePath);
|
||||
}
|
||||
|
||||
public int lookup(char[] text, int offset, int len) {
|
||||
|
|
|
@ -491,6 +491,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
// Make sure loading custom dictionaries from classpath works:
|
||||
@SuppressWarnings("removal")
|
||||
public void testCustomDictionary() throws Exception {
|
||||
Tokenizer tokenizer =
|
||||
new JapaneseTokenizer(
|
||||
|
|
|
@ -58,6 +58,7 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
|
|||
assertEquals(2, dict.getWordCost(wordId));
|
||||
}
|
||||
|
||||
@SuppressWarnings("removal")
|
||||
private TokenInfoDictionary newDictionary(String... entries) throws Exception {
|
||||
Path dir = createTempDir();
|
||||
try (OutputStream out = Files.newOutputStream(dir.resolve("test.csv"));
|
||||
|
|
|
@ -526,4 +526,17 @@ public final class IOUtils {
|
|||
public interface IOFunction<T, R> {
|
||||
R apply(T t) throws IOException;
|
||||
}
|
||||
|
||||
/**
|
||||
* A resource supplier function that may throw an IOException.
|
||||
*
|
||||
* <p>Note that calling {@link #get()} may open a resource such as a file. Callers should make
|
||||
* sure to close the resource it returns (e.g., with try-with-resources).
|
||||
*
|
||||
* @see java.util.function.Supplier
|
||||
*/
|
||||
@FunctionalInterface
|
||||
public interface IOSupplier<T> {
|
||||
/** Returns the supplied value; declared to throw IOException, unlike Supplier#get. */
T get() throws IOException;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue