LUCENE-10558: Implement URL ctor to support classpath/module usage in Kuromoji and Nori dictionaries (main branch) (#871)

This commit is contained in:
Uwe Schindler 2022-05-06 16:49:56 +02:00 committed by GitHub
parent 5f832c64bf
commit 8aa4a56491
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 174 additions and 0 deletions

View File

@ -73,6 +73,11 @@ API Changes
taxoEpoch decide. Add a test case that demonstrates the inconsistencies caused when you reuse taxoArrays on older
checkpoints. (Gautam Worah)
* LUCENE-10558: Add new constructors to Kuromoji and Nori dictionary classes to support classpath /
module system usage. It is now possible to use JDK's Class/ClassLoader/Module#getResource(...) APIs
and pass their returned URL to dictionary constructors to load resources from Classpath or Module
resources. (Uwe Schindler, Tomoko Uchida, Mike Sokolov)
New Features
---------------------
@ -169,6 +174,10 @@ Bug Fixes
* LUCENE-10552: KnnVectorQuery has incorrect equals/ hashCode. (Lu Xugang)
* LUCENE-10558: Restore behaviour of deprecated Kuromoji and Nori dictionary constructors for
custom dictionary support. Please also use new URL-based constructors for classpath/module
system resources. (Uwe Schindler, Tomoko Uchida, Mike Sokolov)
Build
---------------------

View File

@ -64,6 +64,19 @@ the [Log4j JDK Logging Adapter](https://logging.apache.org/log4j/2.x/log4j-jul/i
in combination with the corresponding system property:
`java.util.logging.manager=org.apache.logging.log4j.jul.LogManager`.
### Kuromoji and Nori analysis component constructors for custom dictionaries
The Kuromoji and Nori analysis modules had some way to customize the backing dictionaries
by passing a path to file or classpath resources using some inconsistently implemented
APIs. This was buggy from the beginning, but some users made use of it. Due to the move to the Java
module system, resource lookup on the classpath in particular stopped working correctly.
The Lucene team therefore implemented new APIs to create dictionary implementations
with custom data files. Unfortunately there were some shortcomings in the 9.1 version,
also when using the now deprecated ctors, so users are advised to upgrade to
Lucene 9.2 or stay with 9.0.
See LUCENE-10558 for more details and workarounds.
## Migration from Lucene 8.x to Lucene 9.0
### Rename of binary artifacts from '**-analyzers-**' to '**-analysis-**' (LUCENE-9562)

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis.ja.dict;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.util.IOSupplier;
@ -36,6 +37,17 @@ public final class ConnectionCosts extends org.apache.lucene.analysis.morph.Conn
this(() -> Files.newInputStream(connectionCostsFile));
}
/**
 * Create a {@link ConnectionCosts} from an external resource URL (e.g. from Classpath with {@link
 * ClassLoader#getResource(String)}).
 *
 * <p>The URL is not opened by this constructor itself; a supplier bound to {@link
 * URL#openStream()} is handed to the stream-based constructor.
 *
 * @param connectionCostsUrl where to load connection costs resource; must not be {@code null}
 *     (note that {@code getResource(...)} returns {@code null} when the resource is missing)
 * @throws IOException if resource was not found or broken
 * @throws NullPointerException if {@code connectionCostsUrl} is {@code null}
 */
public ConnectionCosts(URL connectionCostsUrl) throws IOException {
  // A bound method reference null-checks its receiver when it is evaluated, so a null URL is
  // rejected right here instead of later, when the supplier is eventually invoked.
  this(connectionCostsUrl::openStream);
}
/**
 * Creates the instance by delegating to the supplier-based constructor, using {@code
 * ConnectionCosts::getClassResource} as the source of the input stream.
 *
 * <p>NOTE(review): presumably this loads the data bundled with this module; confirm against
 * {@code getClassResource}.
 *
 * @throws IOException propagated from the delegated constructor
 */
private ConnectionCosts() throws IOException {
this(ConnectionCosts::getClassResource);
}

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis.ja.dict;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.analysis.morph.BinaryDictionary;
@ -58,6 +59,25 @@ public final class TokenInfoDictionary extends BinaryDictionary<TokenInfoMorphDa
() -> Files.newInputStream(fstFile));
}
/**
 * Create a {@link TokenInfoDictionary} from an external resource URL (e.g. from Classpath with
 * {@link ClassLoader#getResource(String)}).
 *
 * <p>The URLs are not opened by this constructor itself; suppliers bound to {@link
 * URL#openStream()} are handed to the stream-based constructor.
 *
 * @param targetMapUrl where to load target map resource; must not be {@code null}
 * @param posDictUrl where to load POS dictionary resource; must not be {@code null}
 * @param dictUrl where to load dictionary entries resource; must not be {@code null}
 * @param fstUrl where to load encoded FST data resource; must not be {@code null}
 * @throws IOException if resource was not found or broken
 * @throws NullPointerException if any URL is {@code null} (note that {@code getResource(...)}
 *     returns {@code null} when the resource is missing)
 */
public TokenInfoDictionary(URL targetMapUrl, URL posDictUrl, URL dictUrl, URL fstUrl)
    throws IOException {
  // Bound method references null-check their receivers when evaluated, so a null URL is
  // rejected right here instead of later, when the corresponding supplier is invoked.
  this(
      targetMapUrl::openStream,
      posDictUrl::openStream,
      dictUrl::openStream,
      fstUrl::openStream);
}
private TokenInfoDictionary() throws IOException {
this(
() -> getClassResource(TARGETMAP_FILENAME_SUFFIX),

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis.ja.dict;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.analysis.morph.BinaryDictionary;
@ -45,6 +46,20 @@ public final class UnknownDictionary extends BinaryDictionary<UnknownMorphData>
() -> Files.newInputStream(dictFile));
}
/**
 * Create a {@link UnknownDictionary} from an external resource URL (e.g. from Classpath with
 * {@link ClassLoader#getResource(String)}).
 *
 * <p>The URLs are not opened by this constructor itself; suppliers bound to {@link
 * URL#openStream()} are handed to the stream-based constructor.
 *
 * @param targetMapUrl where to load target map resource; must not be {@code null}
 * @param posDictUrl where to load POS dictionary resource; must not be {@code null}
 * @param dictUrl where to load dictionary entries resource; must not be {@code null}
 * @throws IOException if resource was not found or broken
 * @throws NullPointerException if any URL is {@code null} (note that {@code getResource(...)}
 *     returns {@code null} when the resource is missing)
 */
public UnknownDictionary(URL targetMapUrl, URL posDictUrl, URL dictUrl) throws IOException {
  // Bound method references null-check their receivers when evaluated, so a null URL is
  // rejected right here instead of later, when the corresponding supplier is invoked.
  this(targetMapUrl::openStream, posDictUrl::openStream, dictUrl::openStream);
}
private UnknownDictionary() throws IOException {
this(
() -> getClassResource(TARGETMAP_FILENAME_SUFFIX),

View File

@ -31,6 +31,7 @@ import org.junit.Before;
public class TestExternalDictionary extends LuceneTestCase {
private Path dir;
private ClassLoader loader = getClass().getClassLoader();
@Override
@Before
@ -97,4 +98,32 @@ public class TestExternalDictionary extends LuceneTestCase {
new ConnectionCosts(dir.resolve(dictionaryPath + ConnectionCosts.FILENAME_SUFFIX));
assertEquals(1, cc.get(0, 1));
}
/**
 * Builds a {@link TokenInfoDictionary} from URLs resolved through this test's class loader and
 * checks that the loaded dictionary exposes a non-null FST.
 */
public void testLoadExternalUrlTokenInfoDictionary() throws Exception {
  // Resource prefix: the fully qualified class name with '.' turned into '/'.
  final String resourceBase = TokenInfoDictionary.class.getName().replace('.', '/');
  final TokenInfoDictionary loaded =
      new TokenInfoDictionary(
          loader.getResource(resourceBase + TARGETMAP_FILENAME_SUFFIX),
          loader.getResource(resourceBase + POSDICT_FILENAME_SUFFIX),
          loader.getResource(resourceBase + DICT_FILENAME_SUFFIX),
          loader.getResource(resourceBase + FST_FILENAME_SUFFIX));
  assertNotNull(loaded.getFST());
}
/**
 * Builds an {@link UnknownDictionary} from URLs resolved through this test's class loader and
 * checks that the loaded dictionary exposes a non-null character definition.
 */
public void testLoadExternalUrlUnknownDictionary() throws Exception {
  // Resource prefix: the fully qualified class name with '.' turned into '/'.
  final String resourceBase = UnknownDictionary.class.getName().replace('.', '/');
  final UnknownDictionary loaded =
      new UnknownDictionary(
          loader.getResource(resourceBase + TARGETMAP_FILENAME_SUFFIX),
          loader.getResource(resourceBase + POSDICT_FILENAME_SUFFIX),
          loader.getResource(resourceBase + DICT_FILENAME_SUFFIX));
  assertNotNull(loaded.getCharacterDefinition());
}
/**
 * Builds a {@link ConnectionCosts} from a URL resolved through this test's class loader and
 * checks one known cost entry.
 */
public void testLoadExternalUrlConnectionCosts() throws Exception {
  // Resource name: the fully qualified class name with '.' turned into '/', plus the suffix.
  final String resourceName =
      ConnectionCosts.class.getName().replace('.', '/') + ConnectionCosts.FILENAME_SUFFIX;
  final ConnectionCosts costs = new ConnectionCosts(loader.getResource(resourceName));
  assertEquals(1, costs.get(0, 1));
}
}

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis.ko.dict;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.util.IOSupplier;
@ -36,6 +37,17 @@ public final class ConnectionCosts extends org.apache.lucene.analysis.morph.Conn
this(() -> Files.newInputStream(connectionCostsFile));
}
/**
 * Create a {@link ConnectionCosts} from an external resource URL (e.g. from Classpath with {@link
 * ClassLoader#getResource(String)}).
 *
 * <p>The URL is not opened by this constructor itself; a supplier bound to {@link
 * URL#openStream()} is handed to the stream-based constructor.
 *
 * @param connectionCostsUrl where to load connection costs resource; must not be {@code null}
 *     (note that {@code getResource(...)} returns {@code null} when the resource is missing)
 * @throws IOException if resource was not found or broken
 * @throws NullPointerException if {@code connectionCostsUrl} is {@code null}
 */
public ConnectionCosts(URL connectionCostsUrl) throws IOException {
  // A bound method reference null-checks its receiver when it is evaluated, so a null URL is
  // rejected right here instead of later, when the supplier is eventually invoked.
  this(connectionCostsUrl::openStream);
}
/**
 * Creates the instance by delegating to the supplier-based constructor, using {@code
 * ConnectionCosts::getClassResource} as the source of the input stream.
 *
 * <p>NOTE(review): presumably this loads the data bundled with this module; confirm against
 * {@code getClassResource}.
 *
 * @throws IOException propagated from the delegated constructor
 */
private ConnectionCosts() throws IOException {
this(ConnectionCosts::getClassResource);
}

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis.ko.dict;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.analysis.morph.BinaryDictionary;
@ -66,6 +67,25 @@ public final class TokenInfoDictionary extends BinaryDictionary<TokenInfoMorphDa
() -> Files.newInputStream(fstFile));
}
/**
 * Create a {@link TokenInfoDictionary} from an external resource URL (e.g. from Classpath with
 * {@link ClassLoader#getResource(String)}).
 *
 * <p>The URLs are not opened by this constructor itself; suppliers bound to {@link
 * URL#openStream()} are handed to the stream-based constructor.
 *
 * @param targetMapUrl where to load target map resource; must not be {@code null}
 * @param posDictUrl where to load POS dictionary resource; must not be {@code null}
 * @param dictUrl where to load dictionary entries resource; must not be {@code null}
 * @param fstUrl where to load encoded FST data resource; must not be {@code null}
 * @throws IOException if resource was not found or broken
 * @throws NullPointerException if any URL is {@code null} (note that {@code getResource(...)}
 *     returns {@code null} when the resource is missing)
 */
public TokenInfoDictionary(URL targetMapUrl, URL posDictUrl, URL dictUrl, URL fstUrl)
    throws IOException {
  // Bound method references null-check their receivers when evaluated, so a null URL is
  // rejected right here instead of later, when the corresponding supplier is invoked.
  this(
      targetMapUrl::openStream,
      posDictUrl::openStream,
      dictUrl::openStream,
      fstUrl::openStream);
}
private TokenInfoDictionary(
IOSupplier<InputStream> targetMapResource,
IOSupplier<InputStream> posResource,

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis.ko.dict;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.analysis.morph.BinaryDictionary;
@ -44,6 +45,20 @@ public final class UnknownDictionary extends BinaryDictionary<UnknownMorphData>
() -> Files.newInputStream(dictFile));
}
/**
 * Create a {@link UnknownDictionary} from an external resource URL (e.g. from Classpath with
 * {@link ClassLoader#getResource(String)}).
 *
 * <p>The URLs are not opened by this constructor itself; suppliers bound to {@link
 * URL#openStream()} are handed to the stream-based constructor.
 *
 * @param targetMapUrl where to load target map resource; must not be {@code null}
 * @param posDictUrl where to load POS dictionary resource; must not be {@code null}
 * @param dictUrl where to load dictionary entries resource; must not be {@code null}
 * @throws IOException if resource was not found or broken
 * @throws NullPointerException if any URL is {@code null} (note that {@code getResource(...)}
 *     returns {@code null} when the resource is missing)
 */
public UnknownDictionary(URL targetMapUrl, URL posDictUrl, URL dictUrl) throws IOException {
  // Bound method references null-check their receivers when evaluated, so a null URL is
  // rejected right here instead of later, when the corresponding supplier is invoked.
  this(targetMapUrl::openStream, posDictUrl::openStream, dictUrl::openStream);
}
private UnknownDictionary() throws IOException {
this(
() -> getClassResource(TARGETMAP_FILENAME_SUFFIX),

View File

@ -31,6 +31,7 @@ import org.junit.Before;
public class TestExternalDictionary extends LuceneTestCase {
private Path dir;
private ClassLoader loader = getClass().getClassLoader();
@Override
@Before
@ -97,4 +98,32 @@ public class TestExternalDictionary extends LuceneTestCase {
new ConnectionCosts(dir.resolve(dictionaryPath + ConnectionCosts.FILENAME_SUFFIX));
assertEquals(0, cc.get(1, 1));
}
/**
 * Builds a {@link TokenInfoDictionary} from URLs resolved through this test's class loader and
 * checks that the loaded dictionary exposes a non-null FST.
 */
public void testLoadExternalUrlTokenInfoDictionary() throws Exception {
  // Resource prefix: the fully qualified class name with '.' turned into '/'.
  final String resourceBase = TokenInfoDictionary.class.getName().replace('.', '/');
  final TokenInfoDictionary loaded =
      new TokenInfoDictionary(
          loader.getResource(resourceBase + TARGETMAP_FILENAME_SUFFIX),
          loader.getResource(resourceBase + POSDICT_FILENAME_SUFFIX),
          loader.getResource(resourceBase + DICT_FILENAME_SUFFIX),
          loader.getResource(resourceBase + FST_FILENAME_SUFFIX));
  assertNotNull(loaded.getFST());
}
/**
 * Builds an {@link UnknownDictionary} from URLs resolved through this test's class loader and
 * checks that the loaded dictionary exposes a non-null character definition.
 */
public void testLoadExternalUrlUnknownDictionary() throws Exception {
  // Resource prefix: the fully qualified class name with '.' turned into '/'.
  final String resourceBase = UnknownDictionary.class.getName().replace('.', '/');
  final UnknownDictionary loaded =
      new UnknownDictionary(
          loader.getResource(resourceBase + TARGETMAP_FILENAME_SUFFIX),
          loader.getResource(resourceBase + POSDICT_FILENAME_SUFFIX),
          loader.getResource(resourceBase + DICT_FILENAME_SUFFIX));
  assertNotNull(loaded.getCharacterDefinition());
}
/**
 * Builds a {@link ConnectionCosts} from a URL resolved through this test's class loader and
 * checks one known cost entry.
 */
public void testLoadExternalUrlConnectionCosts() throws Exception {
  // Resource name: the fully qualified class name with '.' turned into '/', plus the suffix.
  final String resourceName =
      ConnectionCosts.class.getName().replace('.', '/') + ConnectionCosts.FILENAME_SUFFIX;
  final ConnectionCosts costs = new ConnectionCosts(loader.getResource(resourceName));
  assertEquals(0, costs.get(1, 1));
}
}