mirror of https://github.com/apache/lucene.git
LUCENE-10558: Implement URL ctor to support classpath/module usage in Kuromoji and Nori dictionaries (main branch) (#871)
This commit is contained in:
parent
5f832c64bf
commit
8aa4a56491
|
@ -73,6 +73,11 @@ API Changes
|
|||
taxoEpoch decide. Add a test case that demonstrates the inconsistencies caused when you reuse taxoArrays on older
|
||||
checkpoints. (Gautam Worah)
|
||||
|
||||
* LUCENE-10558: Add new constructors to Kuromoji and Nori dictionary classes to support classpath /
|
||||
module system usage. It is now possible to use JDK's Class/ClassLoader/Module#getResource(...) apis
|
||||
and pass their returned URL to dictionary constructors to load resources from Classpath or Module
|
||||
resources. (Uwe Schindler, Tomoko Uchida, Mike Sokolov)
|
||||
|
||||
New Features
|
||||
---------------------
|
||||
|
||||
|
@ -169,6 +174,10 @@ Bug Fixes
|
|||
|
||||
* LUCENE-10552: KnnVectorQuery has incorrect equals/ hashCode. (Lu Xugang)
|
||||
|
||||
* LUCENE-10558: Restore behaviour of deprecated Kuromoji and Nori dictionary constructors for
|
||||
custom dictionary support. Please also use new URL-based constructors for classpath/module
|
||||
system resources. (Uwe Schindler, Tomoko Uchida, Mike Sokolov)
|
||||
|
||||
Build
|
||||
---------------------
|
||||
|
||||
|
|
|
@ -64,6 +64,19 @@ the [Log4j JDK Logging Adapter](https://logging.apache.org/log4j/2.x/log4j-jul/i
|
|||
in combination with the corresponding system property:
|
||||
`java.util.logging.manager=org.apache.logging.log4j.jul.LogManager`.
|
||||
|
||||
### Kuromoji and Nori analysis component constructors for custom dictionaries
|
||||
|
||||
The Kuromoji and Nori analysis modules had some way to customize the backing dictionaries
|
||||
by passing a path to a file or classpath resource using some inconsistently implemented
|
||||
APIs. This was buggy from the beginning, but some users made use of it. Due to the move to the Java
|
||||
module system, resource lookup on the classpath in particular stopped working correctly.
|
||||
The Lucene team therefore implemented new APIs to create dictionary implementations
|
||||
with custom data files. Unfortunately there were some shortcomings in the 9.1 version,
|
||||
also when using the now deprecated ctors, so users are advised to upgrade to
|
||||
Lucene 9.2 or stay with 9.0.
|
||||
|
||||
See LUCENE-10558 for more details and workarounds.
|
||||
|
||||
## Migration from Lucene 8.x to Lucene 9.0
|
||||
|
||||
### Rename of binary artifacts from '**-analyzers-**' to '**-analysis-**' (LUCENE-9562)
|
||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.analysis.ja.dict;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.URL;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import org.apache.lucene.util.IOSupplier;
|
||||
|
@ -36,6 +37,17 @@ public final class ConnectionCosts extends org.apache.lucene.analysis.morph.Conn
|
|||
this(() -> Files.newInputStream(connectionCostsFile));
|
||||
}
|
||||
|
||||
/**
 * Builds a {@link ConnectionCosts} whose data is read from a resource URL, for example a URL
 * obtained through {@link ClassLoader#getResource(String)}.
 *
 * @param connectionCostsUrl URL of the connection costs resource
 * @throws IOException if resource was not found or broken
 */
public ConnectionCosts(URL connectionCostsUrl) throws IOException {
  // The stream is not opened here; the lambda opens it only when it is invoked.
  this(
      () -> connectionCostsUrl.openStream());
}
|
||||
|
||||
/** Private no-arg constructor: delegates with {@code getClassResource} as the stream supplier. */
private ConnectionCosts() throws IOException {
  this(
      ConnectionCosts::getClassResource);
}
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.ja.dict;
|
|||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.URL;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import org.apache.lucene.analysis.morph.BinaryDictionary;
|
||||
|
@ -58,6 +59,25 @@ public final class TokenInfoDictionary extends BinaryDictionary<TokenInfoMorphDa
|
|||
() -> Files.newInputStream(fstFile));
|
||||
}
|
||||
|
||||
/**
 * Builds a {@link TokenInfoDictionary} whose data files are read from resource URLs, for example
 * URLs obtained through {@link ClassLoader#getResource(String)}.
 *
 * @param targetMapUrl URL of the target map resource
 * @param posDictUrl URL of the POS dictionary resource
 * @param dictUrl URL of the dictionary entries resource
 * @param fstUrl URL of the encoded FST data resource
 * @throws IOException if resource was not found or broken
 */
public TokenInfoDictionary(URL targetMapUrl, URL posDictUrl, URL dictUrl, URL fstUrl)
    throws IOException {
  // Each lambda opens its stream only when invoked; nothing is opened at this point.
  this(
      () -> targetMapUrl.openStream(),
      () -> posDictUrl.openStream(),
      () -> dictUrl.openStream(),
      () -> fstUrl.openStream());
}
|
||||
|
||||
private TokenInfoDictionary() throws IOException {
|
||||
this(
|
||||
() -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
|
||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.analysis.ja.dict;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.URL;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import org.apache.lucene.analysis.morph.BinaryDictionary;
|
||||
|
@ -45,6 +46,20 @@ public final class UnknownDictionary extends BinaryDictionary<UnknownMorphData>
|
|||
() -> Files.newInputStream(dictFile));
|
||||
}
|
||||
|
||||
/**
 * Builds an {@link UnknownDictionary} whose data files are read from resource URLs, for example
 * URLs obtained through {@link ClassLoader#getResource(String)}.
 *
 * @param targetMapUrl URL of the target map resource
 * @param posDictUrl URL of the POS dictionary resource
 * @param dictUrl URL of the dictionary entries resource
 * @throws IOException if resource was not found or broken
 */
public UnknownDictionary(URL targetMapUrl, URL posDictUrl, URL dictUrl) throws IOException {
  // Each lambda opens its stream only when invoked; nothing is opened at this point.
  this(
      () -> targetMapUrl.openStream(),
      () -> posDictUrl.openStream(),
      () -> dictUrl.openStream());
}
|
||||
|
||||
private UnknownDictionary() throws IOException {
|
||||
this(
|
||||
() -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
|
||||
|
|
|
@ -31,6 +31,7 @@ import org.junit.Before;
|
|||
public class TestExternalDictionary extends LuceneTestCase {
|
||||
|
||||
private Path dir;
|
||||
private ClassLoader loader = getClass().getClassLoader();
|
||||
|
||||
@Override
|
||||
@Before
|
||||
|
@ -97,4 +98,32 @@ public class TestExternalDictionary extends LuceneTestCase {
|
|||
new ConnectionCosts(dir.resolve(dictionaryPath + ConnectionCosts.FILENAME_SUFFIX));
|
||||
assertEquals(1, cc.get(0, 1));
|
||||
}
|
||||
|
||||
public void testLoadExternalUrlTokenInfoDictionary() throws Exception {
|
||||
String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', '/');
|
||||
TokenInfoDictionary dict =
|
||||
new TokenInfoDictionary(
|
||||
loader.getResource(dictionaryPath + TARGETMAP_FILENAME_SUFFIX),
|
||||
loader.getResource(dictionaryPath + POSDICT_FILENAME_SUFFIX),
|
||||
loader.getResource(dictionaryPath + DICT_FILENAME_SUFFIX),
|
||||
loader.getResource(dictionaryPath + FST_FILENAME_SUFFIX));
|
||||
assertNotNull(dict.getFST());
|
||||
}
|
||||
|
||||
public void testLoadExternalUrlUnknownDictionary() throws Exception {
|
||||
String dictionaryPath = UnknownDictionary.class.getName().replace('.', '/');
|
||||
UnknownDictionary dict =
|
||||
new UnknownDictionary(
|
||||
loader.getResource(dictionaryPath + TARGETMAP_FILENAME_SUFFIX),
|
||||
loader.getResource(dictionaryPath + POSDICT_FILENAME_SUFFIX),
|
||||
loader.getResource(dictionaryPath + DICT_FILENAME_SUFFIX));
|
||||
assertNotNull(dict.getCharacterDefinition());
|
||||
}
|
||||
|
||||
public void testLoadExternalUrlConnectionCosts() throws Exception {
|
||||
String dictionaryPath = ConnectionCosts.class.getName().replace('.', '/');
|
||||
ConnectionCosts cc =
|
||||
new ConnectionCosts(loader.getResource(dictionaryPath + ConnectionCosts.FILENAME_SUFFIX));
|
||||
assertEquals(1, cc.get(0, 1));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.analysis.ko.dict;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.URL;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import org.apache.lucene.util.IOSupplier;
|
||||
|
@ -36,6 +37,17 @@ public final class ConnectionCosts extends org.apache.lucene.analysis.morph.Conn
|
|||
this(() -> Files.newInputStream(connectionCostsFile));
|
||||
}
|
||||
|
||||
/**
 * Builds a {@link ConnectionCosts} whose data is read from a resource URL, for example a URL
 * obtained through {@link ClassLoader#getResource(String)}.
 *
 * @param connectionCostsUrl URL of the connection costs resource
 * @throws IOException if resource was not found or broken
 */
public ConnectionCosts(URL connectionCostsUrl) throws IOException {
  // The stream is not opened here; the lambda opens it only when it is invoked.
  this(
      () -> connectionCostsUrl.openStream());
}
|
||||
|
||||
/** Private no-arg constructor: delegates with {@code getClassResource} as the stream supplier. */
private ConnectionCosts() throws IOException {
  this(
      ConnectionCosts::getClassResource);
}
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.ko.dict;
|
|||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.URL;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import org.apache.lucene.analysis.morph.BinaryDictionary;
|
||||
|
@ -66,6 +67,25 @@ public final class TokenInfoDictionary extends BinaryDictionary<TokenInfoMorphDa
|
|||
() -> Files.newInputStream(fstFile));
|
||||
}
|
||||
|
||||
/**
 * Builds a {@link TokenInfoDictionary} whose data files are read from resource URLs, for example
 * URLs obtained through {@link ClassLoader#getResource(String)}.
 *
 * @param targetMapUrl URL of the target map resource
 * @param posDictUrl URL of the POS dictionary resource
 * @param dictUrl URL of the dictionary entries resource
 * @param fstUrl URL of the encoded FST data resource
 * @throws IOException if resource was not found or broken
 */
public TokenInfoDictionary(URL targetMapUrl, URL posDictUrl, URL dictUrl, URL fstUrl)
    throws IOException {
  // Each lambda opens its stream only when invoked; nothing is opened at this point.
  this(
      () -> targetMapUrl.openStream(),
      () -> posDictUrl.openStream(),
      () -> dictUrl.openStream(),
      () -> fstUrl.openStream());
}
|
||||
|
||||
private TokenInfoDictionary(
|
||||
IOSupplier<InputStream> targetMapResource,
|
||||
IOSupplier<InputStream> posResource,
|
||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.analysis.ko.dict;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.URL;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import org.apache.lucene.analysis.morph.BinaryDictionary;
|
||||
|
@ -44,6 +45,20 @@ public final class UnknownDictionary extends BinaryDictionary<UnknownMorphData>
|
|||
() -> Files.newInputStream(dictFile));
|
||||
}
|
||||
|
||||
/**
 * Builds an {@link UnknownDictionary} whose data files are read from resource URLs, for example
 * URLs obtained through {@link ClassLoader#getResource(String)}.
 *
 * @param targetMapUrl URL of the target map resource
 * @param posDictUrl URL of the POS dictionary resource
 * @param dictUrl URL of the dictionary entries resource
 * @throws IOException if resource was not found or broken
 */
public UnknownDictionary(URL targetMapUrl, URL posDictUrl, URL dictUrl) throws IOException {
  // Each lambda opens its stream only when invoked; nothing is opened at this point.
  this(
      () -> targetMapUrl.openStream(),
      () -> posDictUrl.openStream(),
      () -> dictUrl.openStream());
}
|
||||
|
||||
private UnknownDictionary() throws IOException {
|
||||
this(
|
||||
() -> getClassResource(TARGETMAP_FILENAME_SUFFIX),
|
||||
|
|
|
@ -31,6 +31,7 @@ import org.junit.Before;
|
|||
public class TestExternalDictionary extends LuceneTestCase {
|
||||
|
||||
private Path dir;
|
||||
private ClassLoader loader = getClass().getClassLoader();
|
||||
|
||||
@Override
|
||||
@Before
|
||||
|
@ -97,4 +98,32 @@ public class TestExternalDictionary extends LuceneTestCase {
|
|||
new ConnectionCosts(dir.resolve(dictionaryPath + ConnectionCosts.FILENAME_SUFFIX));
|
||||
assertEquals(0, cc.get(1, 1));
|
||||
}
|
||||
|
||||
public void testLoadExternalUrlTokenInfoDictionary() throws Exception {
|
||||
String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', '/');
|
||||
TokenInfoDictionary dict =
|
||||
new TokenInfoDictionary(
|
||||
loader.getResource(dictionaryPath + TARGETMAP_FILENAME_SUFFIX),
|
||||
loader.getResource(dictionaryPath + POSDICT_FILENAME_SUFFIX),
|
||||
loader.getResource(dictionaryPath + DICT_FILENAME_SUFFIX),
|
||||
loader.getResource(dictionaryPath + FST_FILENAME_SUFFIX));
|
||||
assertNotNull(dict.getFST());
|
||||
}
|
||||
|
||||
public void testLoadExternalUrlUnknownDictionary() throws Exception {
|
||||
String dictionaryPath = UnknownDictionary.class.getName().replace('.', '/');
|
||||
UnknownDictionary dict =
|
||||
new UnknownDictionary(
|
||||
loader.getResource(dictionaryPath + TARGETMAP_FILENAME_SUFFIX),
|
||||
loader.getResource(dictionaryPath + POSDICT_FILENAME_SUFFIX),
|
||||
loader.getResource(dictionaryPath + DICT_FILENAME_SUFFIX));
|
||||
assertNotNull(dict.getCharacterDefinition());
|
||||
}
|
||||
|
||||
public void testLoadExternalUrlConnectionCosts() throws Exception {
|
||||
String dictionaryPath = ConnectionCosts.class.getName().replace('.', '/');
|
||||
ConnectionCosts cc =
|
||||
new ConnectionCosts(loader.getResource(dictionaryPath + ConnectionCosts.FILENAME_SUFFIX));
|
||||
assertEquals(0, cc.get(1, 1));
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue