LUCENE-10400: revise binary dictionaries' constructor in kuromoji (#643)

This commit is contained in:
Tomoko Uchida 2022-02-07 19:31:22 +09:00 committed by GitHub
parent e93b08f471
commit e7546c2427
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 235 additions and 118 deletions

View File

@ -75,6 +75,11 @@ API Changes
* LUCENE-10368: IntTaxonomyFacets has been deprecated and is no longer a supported extension point
for user-created faceting implementations. (Greg Miller)
* LUCENE-10400: Add constructors that take external resource Paths to dictionary classes in Kuromoji:
ConnectionCosts, TokenInfoDictionary, and UnknownDictionary. Old constructors that take resource scheme and
resource path in those classes are deprecated; they are replaced by the new constructors and are planned to be
removed in a future release. (Tomoko Uchida, Uwe Schindler, Mike Sokolov)
* LUCENE-10050: Deprecate DrillSideways#search(Query, Collector) in favor of
DrillSideways#search(Query, CollectorManager). This reflects the change (LUCENE-10002) being made in
IndexSearcher#search that trends towards using CollectorManagers over Collectors. (Gautam Worah)

View File

@ -16,9 +16,10 @@
*/
package org.apache.lucene.analysis.ja.dict;
import static org.apache.lucene.util.IOUtils.IOSupplier;
import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
@ -36,6 +37,7 @@ import org.apache.lucene.util.IntsRef;
public abstract class BinaryDictionary implements Dictionary {
/** Used to specify where (dictionary) resources get loaded from. */
@Deprecated(forRemoval = true, since = "9.1")
public enum ResourceScheme {
CLASSPATH,
FILE
@ -50,91 +52,38 @@ public abstract class BinaryDictionary implements Dictionary {
public static final String POSDICT_HEADER = "kuromoji_dict_pos";
public static final int VERSION = 1;
private final ResourceScheme resourceScheme;
private final String resourcePath;
private final ByteBuffer buffer;
private final int[] targetMapOffsets, targetMap;
private final String[] posDict;
private final String[] inflTypeDict;
private final String[] inflFormDict;
protected BinaryDictionary() throws IOException {
this(ResourceScheme.CLASSPATH, null);
}
/**
* @param resourceScheme - scheme for loading resources (FILE or CLASSPATH).
* @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH
* scheme only, use this class's name as the path.
*/
protected BinaryDictionary(ResourceScheme resourceScheme, String resourcePath)
protected BinaryDictionary(
IOSupplier<InputStream> targetMapResource,
IOSupplier<InputStream> posResource,
IOSupplier<InputStream> dictResource)
throws IOException {
this.resourceScheme = resourceScheme;
if (resourcePath == null) {
if (resourceScheme != ResourceScheme.CLASSPATH) {
throw new IllegalArgumentException(
"resourcePath must be supplied with FILE resource scheme");
}
this.resourcePath = getClass().getSimpleName();
} else {
if (resourceScheme == ResourceScheme.CLASSPATH && !resourcePath.startsWith("/")) {
resourcePath = "/".concat(resourcePath);
}
this.resourcePath = resourcePath;
}
int[] targetMapOffsets = null, targetMap = null;
String[] posDict = null;
String[] inflFormDict = null;
String[] inflTypeDict = null;
ByteBuffer buffer = null;
try (InputStream mapIS = new BufferedInputStream(getResource(TARGETMAP_FILENAME_SUFFIX));
InputStream posIS = new BufferedInputStream(getResource(POSDICT_FILENAME_SUFFIX));
// no buffering here, as we load in one large buffer
InputStream dictIS = getResource(DICT_FILENAME_SUFFIX)) {
DataInput in = new InputStreamDataInput(mapIS);
try (InputStream mapIS = new BufferedInputStream(targetMapResource.get())) {
final DataInput in = new InputStreamDataInput(mapIS);
CodecUtil.checkHeader(in, TARGETMAP_HEADER, VERSION, VERSION);
targetMap = new int[in.readVInt()];
targetMapOffsets = new int[in.readVInt()];
int accum = 0, sourceId = 0;
for (int ofs = 0; ofs < targetMap.length; ofs++) {
final int val = in.readVInt();
if ((val & 0x01) != 0) {
targetMapOffsets[sourceId] = ofs;
sourceId++;
}
accum += val >>> 1;
targetMap[ofs] = accum;
}
if (sourceId + 1 != targetMapOffsets.length)
throw new IOException(
"targetMap file format broken; targetMap.length="
+ targetMap.length
+ ", targetMapOffsets.length="
+ targetMapOffsets.length
+ ", sourceId="
+ sourceId);
targetMapOffsets[sourceId] = targetMap.length;
this.targetMap = new int[in.readVInt()];
this.targetMapOffsets = new int[in.readVInt()];
populateTargetMap(in, this.targetMap, this.targetMapOffsets);
}
in = new InputStreamDataInput(posIS);
try (InputStream posIS = new BufferedInputStream(posResource.get())) {
final DataInput in = new InputStreamDataInput(posIS);
CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION);
int posSize = in.readVInt();
posDict = new String[posSize];
inflTypeDict = new String[posSize];
inflFormDict = new String[posSize];
for (int j = 0; j < posSize; j++) {
posDict[j] = in.readString();
inflTypeDict[j] = in.readString();
inflFormDict[j] = in.readString();
// this is how we encode null inflections
if (inflTypeDict[j].length() == 0) {
inflTypeDict[j] = null;
}
if (inflFormDict[j].length() == 0) {
inflFormDict[j] = null;
}
}
final int posSize = in.readVInt();
this.posDict = new String[posSize];
this.inflTypeDict = new String[posSize];
this.inflFormDict = new String[posSize];
populatePosDict(in, posSize, this.posDict, this.inflTypeDict, this.inflFormDict);
}
in = new InputStreamDataInput(dictIS);
// no buffering here, as we load in one large buffer
try (InputStream dictIS = dictResource.get()) {
final DataInput in = new InputStreamDataInput(dictIS);
CodecUtil.checkHeader(in, DICT_HEADER, VERSION, VERSION);
final int size = in.readVInt();
final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
@ -143,28 +92,51 @@ public abstract class BinaryDictionary implements Dictionary {
if (read != size) {
throw new EOFException("Cannot read whole dictionary");
}
buffer = tmpBuffer.asReadOnlyBuffer();
}
this.targetMap = targetMap;
this.targetMapOffsets = targetMapOffsets;
this.posDict = posDict;
this.inflTypeDict = inflTypeDict;
this.inflFormDict = inflFormDict;
this.buffer = buffer;
}
protected final InputStream getResource(String suffix) throws IOException {
switch (resourceScheme) {
case CLASSPATH:
return getClassResource(resourcePath + suffix);
case FILE:
return Files.newInputStream(Paths.get(resourcePath + suffix));
default:
throw new IllegalStateException("unknown resource scheme " + resourceScheme);
this.buffer = tmpBuffer.asReadOnlyBuffer();
}
}
/**
 * Fills {@code targetMap} and {@code targetMapOffsets} from the VInts read from {@code in}.
 *
 * <p>One VInt is read per {@code targetMap} slot. Bit 0 of the value is a flag and the remaining
 * bits are a delta: the delta is added to a running sum that is stored in {@code targetMap[ofs]},
 * and a set flag records {@code ofs} as the start offset of the next source id in {@code
 * targetMapOffsets}. After the loop, the slot following the last recorded offset is set to {@code
 * targetMap.length}.
 *
 * @param in input positioned at the first encoded value
 * @param targetMap pre-sized array that receives the accumulated values
 * @param targetMapOffsets pre-sized array that receives the start offsets; it must have exactly
 *     one more slot than there are flagged values
 * @throws IOException if reading fails, or if the number of flagged values does not match {@code
 *     targetMapOffsets.length - 1}
 */
private static void populateTargetMap(DataInput in, int[] targetMap, int[] targetMapOffsets)
    throws IOException {
  int accum = 0, sourceId = 0;
  for (int ofs = 0; ofs < targetMap.length; ofs++) {
    final int val = in.readVInt();
    if ((val & 0x01) != 0) {
      // Guard the write: a corrupt file with too many flagged values must be reported as a
      // format error, not surface as an ArrayIndexOutOfBoundsException.
      if (sourceId >= targetMapOffsets.length) {
        throw brokenTargetMap(targetMap.length, targetMapOffsets.length, sourceId);
      }
      targetMapOffsets[sourceId] = ofs;
      sourceId++;
    }
    accum += val >>> 1;
    targetMap[ofs] = accum;
  }
  if (sourceId + 1 != targetMapOffsets.length) {
    throw brokenTargetMap(targetMap.length, targetMapOffsets.length, sourceId);
  }
  // Final slot holds the end offset of the last source id.
  targetMapOffsets[sourceId] = targetMap.length;
}

/** Builds the exception reported when the target map data is inconsistent. */
private static IOException brokenTargetMap(int targetMapLength, int offsetsLength, int sourceId) {
  return new IOException(
      "targetMap file format broken; targetMap.length="
          + targetMapLength
          + ", targetMapOffsets.length="
          + offsetsLength
          + ", sourceId="
          + sourceId);
}
/**
 * Reads {@code posSize} entries from {@code in}, each made of three strings, into the three
 * parallel arrays.
 *
 * @param in input positioned at the first entry
 * @param posSize number of entries to read
 * @param posDict receives the first string of each entry
 * @param inflTypeDict receives the second string of each entry, or {@code null} if it is empty
 * @param inflFormDict receives the third string of each entry, or {@code null} if it is empty
 * @throws IOException if reading from {@code in} fails
 */
private static void populatePosDict(
    DataInput in, int posSize, String[] posDict, String[] inflTypeDict, String[] inflFormDict)
    throws IOException {
  for (int entry = 0; entry < posSize; entry++) {
    posDict[entry] = in.readString();

    // An empty string is how a null inflection is encoded.
    final String inflType = in.readString();
    inflTypeDict[entry] = inflType.isEmpty() ? null : inflType;

    final String inflForm = in.readString();
    inflFormDict[entry] = inflForm.isEmpty() ? null : inflForm;
  }
}
@Deprecated(forRemoval = true, since = "9.1")
public static final InputStream getResource(ResourceScheme scheme, String path)
throws IOException {
switch (scheme) {
@ -177,17 +149,7 @@ public abstract class BinaryDictionary implements Dictionary {
}
}
// util, reused by ConnectionCosts and CharacterDefinition
/**
 * Opens the resource named {@code <simple class name><suffix>}, looked up through {@code clazz}.
 *
 * @param clazz class whose simple name forms the resource base name and which performs the lookup
 * @param suffix text appended to the simple class name to form the resource name
 * @return an open stream for the resource; never {@code null}
 * @throws FileNotFoundException if the lookup yields no resource
 */
public static final InputStream getClassResource(Class<?> clazz, String suffix)
    throws IOException {
  final String resourceName = clazz.getSimpleName() + suffix;
  final InputStream stream = clazz.getResourceAsStream(resourceName);
  if (stream != null) {
    return stream;
  }
  final String classpathLocation = clazz.getName().replace('.', '/') + suffix;
  throw new FileNotFoundException("Not in classpath: " + classpathLocation);
}
/**
 * Looks up {@code path} as a resource through {@link BinaryDictionary}'s class and passes the
 * result to {@code IOUtils.requireResourceNonNull} (presumably failing when the resource is
 * missing; confirm against that helper).
 */
@Deprecated(forRemoval = true, since = "9.1")
private static InputStream getClassResource(String path) throws IOException {
  final InputStream stream = BinaryDictionary.class.getResourceAsStream(path);
  return IOUtils.requireResourceNonNull(stream, path);
}

View File

@ -22,6 +22,7 @@ import java.io.InputStream;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.IOUtils;
/** Character category data. */
public final class CharacterDefinition {
@ -68,8 +69,7 @@ public final class CharacterDefinition {
public static final byte KANJINUMERIC = (byte) CharacterClass.KANJINUMERIC.ordinal();
private CharacterDefinition() throws IOException {
try (InputStream is =
new BufferedInputStream(BinaryDictionary.getClassResource(getClass(), FILENAME_SUFFIX))) {
try (InputStream is = new BufferedInputStream(getClassResource())) {
final DataInput in = new InputStreamDataInput(is);
CodecUtil.checkHeader(in, HEADER, VERSION, VERSION);
in.readBytes(characterCategoryMap, 0, characterCategoryMap.length);
@ -81,6 +81,12 @@ public final class CharacterDefinition {
}
}
/**
 * Opens the resource named after this class's simple name plus {@code FILENAME_SUFFIX}, looked
 * up through {@link CharacterDefinition}'s class.
 */
private static InputStream getClassResource() throws IOException {
  final Class<CharacterDefinition> owner = CharacterDefinition.class;
  final String name = owner.getSimpleName() + FILENAME_SUFFIX;
  final InputStream stream = owner.getResourceAsStream(name);
  // NOTE(review): requireResourceNonNull presumably rejects a missing resource; confirm.
  return IOUtils.requireResourceNonNull(stream, name);
}
/**
 * Returns the byte stored in the character category map at index {@code c}.
 *
 * @param c character used as the index into the map
 */
public byte getCharacterClass(char c) {
  final byte characterClass = characterCategoryMap[c];
  return characterClass;
}

View File

@ -16,13 +16,19 @@
*/
package org.apache.lucene.analysis.ja.dict;
import static org.apache.lucene.util.IOUtils.IOSupplier;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.IOUtils;
/** n-gram connection cost data */
public final class ConnectionCosts {
@ -37,11 +43,33 @@ public final class ConnectionCosts {
/**
* @param scheme - scheme for loading resources (FILE or CLASSPATH).
* @param path - where to load resources from, without the ".dat" suffix
* @deprecated replaced by {@link #ConnectionCosts(Path)}
*/
@Deprecated(forRemoval = true, since = "9.1")
@SuppressWarnings("removal")
public ConnectionCosts(BinaryDictionary.ResourceScheme scheme, String path) throws IOException {
try (InputStream is =
new BufferedInputStream(
BinaryDictionary.getResource(scheme, "/" + path.replace('.', '/') + FILENAME_SUFFIX))) {
this(
scheme == BinaryDictionary.ResourceScheme.FILE
? () -> Files.newInputStream(Paths.get(path + FILENAME_SUFFIX))
: ConnectionCosts::getClassResource);
}
/**
* Creates a {@link ConnectionCosts} by loading the connection cost data from an external file.
*
* <p>The file is not opened here; a supplier that opens it with {@link Files#newInputStream} is
* handed to the delegated constructor.
*
* @param connectionCostsFile path of the file to load the connection costs from
* @throws IOException if the resource was not found or is broken
*/
public ConnectionCosts(Path connectionCostsFile) throws IOException {
this(() -> Files.newInputStream(connectionCostsFile));
}
/**
* Creates a {@link ConnectionCosts} from the stream supplied by {@code getClassResource}.
*
* @throws IOException if the delegated constructor fails to load the data
*/
private ConnectionCosts() throws IOException {
this(ConnectionCosts::getClassResource);
}
private ConnectionCosts(IOSupplier<InputStream> connectionCostResource) throws IOException {
try (InputStream is = new BufferedInputStream(connectionCostResource.get())) {
final DataInput in = new InputStreamDataInput(is);
CodecUtil.checkHeader(in, HEADER, VERSION, VERSION);
forwardSize = in.readVInt();
@ -61,8 +89,10 @@ public final class ConnectionCosts {
}
}
private ConnectionCosts() throws IOException {
this(BinaryDictionary.ResourceScheme.CLASSPATH, ConnectionCosts.class.getName());
private static InputStream getClassResource() throws IOException {
final String resourcePath = ConnectionCosts.class.getSimpleName() + FILENAME_SUFFIX;
return IOUtils.requireResourceNonNull(
ConnectionCosts.class.getResourceAsStream(resourcePath), resourcePath);
}
public int get(int forwardId, int backwardId) {

View File

@ -16,11 +16,17 @@
*/
package org.apache.lucene.analysis.ja.dict;
import static org.apache.lucene.util.IOUtils.IOSupplier;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
@ -38,12 +44,62 @@ public final class TokenInfoDictionary extends BinaryDictionary {
* @param resourceScheme - scheme for loading resources (FILE or CLASSPATH).
* @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH
* scheme only, use this class's name as the path.
* @deprecated replaced by {@link #TokenInfoDictionary(Path, Path, Path, Path)}
*/
@Deprecated(forRemoval = true, since = "9.1")
@SuppressWarnings("removal")
public TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath)
throws IOException {
super(resourceScheme, resourcePath);
this(
resourceScheme == ResourceScheme.FILE
? () -> Files.newInputStream(Paths.get(resourcePath + TARGETMAP_FILENAME_SUFFIX))
: () -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
resourceScheme == ResourceScheme.FILE
? () -> Files.newInputStream(Paths.get(resourcePath + POSDICT_FILENAME_SUFFIX))
: () -> getClassResource(POSDICT_FILENAME_SUFFIX),
resourceScheme == ResourceScheme.FILE
? () -> Files.newInputStream(Paths.get(resourcePath + DICT_FILENAME_SUFFIX))
: () -> getClassResource(DICT_FILENAME_SUFFIX),
resourceScheme == ResourceScheme.FILE
? () -> Files.newInputStream(Paths.get(resourcePath + FST_FILENAME_SUFFIX))
: () -> getClassResource(FST_FILENAME_SUFFIX));
}
/**
* Creates a {@link TokenInfoDictionary} by loading its four data files from external paths.
*
* <p>The files are not opened here; suppliers that open them with {@link Files#newInputStream}
* are handed to the delegated constructor.
*
* @param targetMapFile path of the file to load the target map from
* @param posDictFile path of the file to load the POS dictionary from
* @param dictFile path of the file to load the dictionary entries from
* @param fstFile path of the file to load the encoded FST data from
* @throws IOException if a resource was not found or is broken
*/
public TokenInfoDictionary(Path targetMapFile, Path posDictFile, Path dictFile, Path fstFile)
throws IOException {
this(
() -> Files.newInputStream(targetMapFile),
() -> Files.newInputStream(posDictFile),
() -> Files.newInputStream(dictFile),
() -> Files.newInputStream(fstFile));
}
/**
* Creates a {@link TokenInfoDictionary} whose four resources are obtained through {@code
* getClassResource}, one call per filename suffix.
*
* @throws IOException if the delegated constructor fails to load a resource
*/
private TokenInfoDictionary() throws IOException {
this(
() -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
() -> getClassResource(POSDICT_FILENAME_SUFFIX),
() -> getClassResource(DICT_FILENAME_SUFFIX),
() -> getClassResource(FST_FILENAME_SUFFIX));
}
private TokenInfoDictionary(
IOSupplier<InputStream> targetMapResource,
IOSupplier<InputStream> posResource,
IOSupplier<InputStream> dictResource,
IOSupplier<InputStream> fstResource)
throws IOException {
super(targetMapResource, posResource, dictResource);
FST<Long> fst;
try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {
try (InputStream is = new BufferedInputStream(fstResource.get())) {
DataInput in = new InputStreamDataInput(is);
fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
}
@ -51,8 +107,10 @@ public final class TokenInfoDictionary extends BinaryDictionary {
this.fst = new TokenInfoFST(fst, true);
}
private TokenInfoDictionary() throws IOException {
this(ResourceScheme.CLASSPATH, null);
private static InputStream getClassResource(String suffix) throws IOException {
final String resourcePath = TokenInfoDictionary.class.getSimpleName() + suffix;
return IOUtils.requireResourceNonNull(
TokenInfoDictionary.class.getResourceAsStream(resourcePath), resourcePath);
}
public TokenInfoFST getFST() {

View File

@ -17,6 +17,11 @@
package org.apache.lucene.analysis.ja.dict;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.util.IOUtils;
/** Dictionary for unknown-word handling. */
public final class UnknownDictionary extends BinaryDictionary {
@ -27,13 +32,49 @@ public final class UnknownDictionary extends BinaryDictionary {
* @param scheme scheme for loading resources (FILE or CLASSPATH).
* @param path where to load resources from; a path, including the file base name without
* extension; this is used to match multiple files with the same base name.
* @deprecated replaced by {@link #UnknownDictionary(Path, Path, Path)}
*/
@Deprecated(forRemoval = true, since = "9.1")
@SuppressWarnings("removal")
public UnknownDictionary(ResourceScheme scheme, String path) throws IOException {
super(scheme, path);
super(
scheme == ResourceScheme.FILE
? () -> Files.newInputStream(Paths.get(path + TARGETMAP_FILENAME_SUFFIX))
: () -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
scheme == ResourceScheme.FILE
? () -> Files.newInputStream(Paths.get(path + POSDICT_FILENAME_SUFFIX))
: () -> getClassResource(POSDICT_FILENAME_SUFFIX),
scheme == ResourceScheme.FILE
? () -> Files.newInputStream(Paths.get(path + DICT_FILENAME_SUFFIX))
: () -> getClassResource(DICT_FILENAME_SUFFIX));
}
/**
* Creates an {@link UnknownDictionary} by loading its three data files from external paths.
*
* <p>The files are not opened here; suppliers that open them with {@link Files#newInputStream}
* are handed to the superclass constructor.
*
* @param targetMapFile path of the file to load the target map from
* @param posDictFile path of the file to load the POS dictionary from
* @param dictFile path of the file to load the dictionary entries from
* @throws IOException if a resource was not found or is broken
*/
public UnknownDictionary(Path targetMapFile, Path posDictFile, Path dictFile) throws IOException {
super(
() -> Files.newInputStream(targetMapFile),
() -> Files.newInputStream(posDictFile),
() -> Files.newInputStream(dictFile));
}
private UnknownDictionary() throws IOException {
super();
super(
() -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
() -> getClassResource(POSDICT_FILENAME_SUFFIX),
() -> getClassResource(DICT_FILENAME_SUFFIX));
}
/**
 * Opens the resource named after this class's simple name plus {@code suffix}, looked up
 * through {@link UnknownDictionary}'s class.
 *
 * @param suffix text appended to the simple class name to form the resource name
 */
private static InputStream getClassResource(String suffix) throws IOException {
  final Class<UnknownDictionary> owner = UnknownDictionary.class;
  final String name = owner.getSimpleName() + suffix;
  final InputStream stream = owner.getResourceAsStream(name);
  // NOTE(review): requireResourceNonNull presumably rejects a missing resource; confirm.
  return IOUtils.requireResourceNonNull(stream, name);
}
public int lookup(char[] text, int offset, int len) {

View File

@ -491,6 +491,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
}
// Make sure loading custom dictionaries from classpath works:
@SuppressWarnings("removal")
public void testCustomDictionary() throws Exception {
Tokenizer tokenizer =
new JapaneseTokenizer(

View File

@ -58,6 +58,7 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
assertEquals(2, dict.getWordCost(wordId));
}
@SuppressWarnings("removal")
private TokenInfoDictionary newDictionary(String... entries) throws Exception {
Path dir = createTempDir();
try (OutputStream out = Files.newOutputStream(dir.resolve("test.csv"));

View File

@ -526,4 +526,17 @@ public final class IOUtils {
public interface IOFunction<T, R> {
R apply(T t) throws IOException;
}
/**
* A supplier of a resource whose {@link #get()} may throw an {@link IOException}.
*
* <p>Note that calling {@link #get()} may open a resource such as a file. Callers should make
* sure to close the returned resource (e.g., with try-with-resources).
*
* @param <T> the type of the supplied resource
* @see java.util.function.Supplier
*/
@FunctionalInterface
public interface IOSupplier<T> {
/**
* Supplies the resource.
*
* @return the supplied resource
* @throws IOException if the resource cannot be supplied
*/
T get() throws IOException;
}
}