From 8aa4a564913df63c1aaf679fe1474914da6c29aa Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Fri, 6 May 2022 16:49:56 +0200 Subject: [PATCH] LUCENE-10558: Implement URL ctor to support classpath/module usage in Kuromoji and Nori dictionaries (main branch) (#871) --- lucene/CHANGES.txt | 9 ++++++ lucene/MIGRATE.md | 13 +++++++++ .../analysis/ja/dict/ConnectionCosts.java | 12 ++++++++ .../analysis/ja/dict/TokenInfoDictionary.java | 20 +++++++++++++ .../analysis/ja/dict/UnknownDictionary.java | 15 ++++++++++ .../ja/dict/TestExternalDictionary.java | 29 +++++++++++++++++++ .../analysis/ko/dict/ConnectionCosts.java | 12 ++++++++ .../analysis/ko/dict/TokenInfoDictionary.java | 20 +++++++++++++ .../analysis/ko/dict/UnknownDictionary.java | 15 ++++++++++ .../ko/dict/TestExternalDictionary.java | 29 +++++++++++++++++++ 10 files changed, 174 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 2ed51c60b63..a5ddd1180a3 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -73,6 +73,11 @@ API Changes taxoEpoch decide. Add a test case that demonstrates the inconsistencies caused when you reuse taxoArrays on older checkpoints. (Gautam Worah) +* LUCENE-10558: Add new constructors to Kuromoji and Nori dictionary classes to support classpath / + module system usage. It is now possible to use JDK's Class/ClassLoader/Module#getResource(...) apis + and pass their returned URL to dictionary constructors to load resources from Classpath or Module + resources. (Uwe Schindler, Tomoko Uchida, Mike Sokolov) + New Features --------------------- @@ -169,6 +174,10 @@ Bug Fixes * LUCENE-10552: KnnVectorQuery has incorrect equals/ hashCode. (Lu Xugang) +* LUCENE-10558: Restore behaviour of deprecated Kuromoji and Nori dictionary constructors for + custom dictionary support. Please also use new URL-based constructors for classpath/module + system resources. 
(Uwe Schindler, Tomoko Uchida, Mike Sokolov) + Build --------------------- diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md index 79aaf620b8d..d1d1b27f29f 100644 --- a/lucene/MIGRATE.md +++ b/lucene/MIGRATE.md @@ -64,6 +64,19 @@ the [Log4j JDK Logging Adapter](https://logging.apache.org/log4j/2.x/log4j-jul/i in combination with the corresponding system property: `java.util.logging.manager=org.apache.logging.log4j.jul.LogManager`. +### Kuromoji and Nori analysis component constructors for custom dictionaries + +The Kuromoji and Nori analysis modules had some way to customize the backing dictionaries +by passing a path to file or classpath resources using some inconsistently implemented +APIs. This was buggy from the beginning, but some users made use of it. Due to the move to the Java +module system, resource lookup on the classpath in particular stopped working correctly. +The Lucene team therefore implemented new APIs to create dictionary implementations +with custom data files. Unfortunately there were some shortcomings in the 9.1 version, +also when using the now deprecated ctors, so users are advised to upgrade to +Lucene 9.2 or stay with 9.0. + +See LUCENE-10558 for more details and workarounds. 
+ ## Migration from Lucene 8.x to Lucene 9.0 ### Rename of binary artifacts from '**-analyzers-**' to '**-analysis-**' (LUCENE-9562) diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/ConnectionCosts.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/ConnectionCosts.java index c11b9ee716a..fcc9223b9ac 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/ConnectionCosts.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/ConnectionCosts.java @@ -18,6 +18,7 @@ package org.apache.lucene.analysis.ja.dict; import java.io.IOException; import java.io.InputStream; +import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import org.apache.lucene.util.IOSupplier; @@ -36,6 +37,17 @@ public final class ConnectionCosts extends org.apache.lucene.analysis.morph.Conn this(() -> Files.newInputStream(connectionCostsFile)); } + /** + * Create a {@link ConnectionCosts} from an external resource URL (e.g. from Classpath with {@link + * ClassLoader#getResource(String)}). 
+ * + * @param connectionCostsUrl where to load connection costs resource + * @throws IOException if resource was not found or broken + */ + public ConnectionCosts(URL connectionCostsUrl) throws IOException { + this(() -> connectionCostsUrl.openStream()); + } + private ConnectionCosts() throws IOException { this(ConnectionCosts::getClassResource); } diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java index c769587829b..ed7b49fb138 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java @@ -19,6 +19,7 @@ package org.apache.lucene.analysis.ja.dict; import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; +import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import org.apache.lucene.analysis.morph.BinaryDictionary; @@ -58,6 +59,25 @@ public final class TokenInfoDictionary extends BinaryDictionary Files.newInputStream(fstFile)); } + /** + * Create a {@link TokenInfoDictionary} from an external resource URL (e.g. from Classpath with + * {@link ClassLoader#getResource(String)}). 
+ * + * @param targetMapUrl where to load target map resource + * @param posDictUrl where to load POS dictionary resource + * @param dictUrl where to load dictionary entries resource + * @param fstUrl where to load encoded FST data resource + * @throws IOException if resource was not found or broken + */ + public TokenInfoDictionary(URL targetMapUrl, URL posDictUrl, URL dictUrl, URL fstUrl) + throws IOException { + this( + () -> targetMapUrl.openStream(), + () -> posDictUrl.openStream(), + () -> dictUrl.openStream(), + () -> fstUrl.openStream()); + } + private TokenInfoDictionary() throws IOException { this( () -> getClassResource(TARGETMAP_FILENAME_SUFFIX), diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionary.java index b4a9012e3ee..88a2715a137 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionary.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionary.java @@ -18,6 +18,7 @@ package org.apache.lucene.analysis.ja.dict; import java.io.IOException; import java.io.InputStream; +import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import org.apache.lucene.analysis.morph.BinaryDictionary; @@ -45,6 +46,20 @@ public final class UnknownDictionary extends BinaryDictionary () -> Files.newInputStream(dictFile)); } + /** + * Create a {@link UnknownDictionary} from an external resource URL (e.g. from Classpath with + * {@link ClassLoader#getResource(String)}). 
+ * + * @param targetMapUrl where to load target map resource + * @param posDictUrl where to load POS dictionary resource + * @param dictUrl where to load dictionary entries resource + * @throws IOException if resource was not found or broken + */ + public UnknownDictionary(URL targetMapUrl, URL posDictUrl, URL dictUrl) throws IOException { + this( + () -> targetMapUrl.openStream(), () -> posDictUrl.openStream(), () -> dictUrl.openStream()); + } + private UnknownDictionary() throws IOException { this( () -> getClassResource(TARGETMAP_FILENAME_SUFFIX), diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestExternalDictionary.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestExternalDictionary.java index 6aaae0cc7ce..7000a6ada11 100644 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestExternalDictionary.java +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestExternalDictionary.java @@ -31,6 +31,7 @@ import org.junit.Before; public class TestExternalDictionary extends LuceneTestCase { private Path dir; + private ClassLoader loader = getClass().getClassLoader(); @Override @Before @@ -97,4 +98,32 @@ public class TestExternalDictionary extends LuceneTestCase { new ConnectionCosts(dir.resolve(dictionaryPath + ConnectionCosts.FILENAME_SUFFIX)); assertEquals(1, cc.get(0, 1)); } + + public void testLoadExternalUrlTokenInfoDictionary() throws Exception { + String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', '/'); + TokenInfoDictionary dict = + new TokenInfoDictionary( + loader.getResource(dictionaryPath + TARGETMAP_FILENAME_SUFFIX), + loader.getResource(dictionaryPath + POSDICT_FILENAME_SUFFIX), + loader.getResource(dictionaryPath + DICT_FILENAME_SUFFIX), + loader.getResource(dictionaryPath + FST_FILENAME_SUFFIX)); + assertNotNull(dict.getFST()); + } + + public void testLoadExternalUrlUnknownDictionary() throws Exception { + String 
dictionaryPath = UnknownDictionary.class.getName().replace('.', '/'); + UnknownDictionary dict = + new UnknownDictionary( + loader.getResource(dictionaryPath + TARGETMAP_FILENAME_SUFFIX), + loader.getResource(dictionaryPath + POSDICT_FILENAME_SUFFIX), + loader.getResource(dictionaryPath + DICT_FILENAME_SUFFIX)); + assertNotNull(dict.getCharacterDefinition()); + } + + public void testLoadExternalUrlConnectionCosts() throws Exception { + String dictionaryPath = ConnectionCosts.class.getName().replace('.', '/'); + ConnectionCosts cc = + new ConnectionCosts(loader.getResource(dictionaryPath + ConnectionCosts.FILENAME_SUFFIX)); + assertEquals(1, cc.get(0, 1)); + } } diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/ConnectionCosts.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/ConnectionCosts.java index 3b13a86b90a..f7c74c3f0b4 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/ConnectionCosts.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/ConnectionCosts.java @@ -18,6 +18,7 @@ package org.apache.lucene.analysis.ko.dict; import java.io.IOException; import java.io.InputStream; +import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import org.apache.lucene.util.IOSupplier; @@ -36,6 +37,17 @@ public final class ConnectionCosts extends org.apache.lucene.analysis.morph.Conn this(() -> Files.newInputStream(connectionCostsFile)); } + /** + * Create a {@link ConnectionCosts} from an external resource URL (e.g. from Classpath with {@link + * ClassLoader#getResource(String)}). 
+ * + * @param connectionCostsUrl where to load connection costs resource + * @throws IOException if resource was not found or broken + */ + public ConnectionCosts(URL connectionCostsUrl) throws IOException { + this(() -> connectionCostsUrl.openStream()); + } + private ConnectionCosts() throws IOException { this(ConnectionCosts::getClassResource); } diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java index b8132cea61b..317123d5b88 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java @@ -19,6 +19,7 @@ package org.apache.lucene.analysis.ko.dict; import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; +import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import org.apache.lucene.analysis.morph.BinaryDictionary; @@ -66,6 +67,25 @@ public final class TokenInfoDictionary extends BinaryDictionary Files.newInputStream(fstFile)); } + /** + * Create a {@link TokenInfoDictionary} from an external resource URL (e.g. from Classpath with + * {@link ClassLoader#getResource(String)}). 
+ * + * @param targetMapUrl where to load target map resource + * @param posDictUrl where to load POS dictionary resource + * @param dictUrl where to load dictionary entries resource + * @param fstUrl where to load encoded FST data resource + * @throws IOException if resource was not found or broken + */ + public TokenInfoDictionary(URL targetMapUrl, URL posDictUrl, URL dictUrl, URL fstUrl) + throws IOException { + this( + () -> targetMapUrl.openStream(), + () -> posDictUrl.openStream(), + () -> dictUrl.openStream(), + () -> fstUrl.openStream()); + } + private TokenInfoDictionary( IOSupplier targetMapResource, IOSupplier posResource, diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownDictionary.java index 4b45fd33258..fa8cd21fb38 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownDictionary.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownDictionary.java @@ -18,6 +18,7 @@ package org.apache.lucene.analysis.ko.dict; import java.io.IOException; import java.io.InputStream; +import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import org.apache.lucene.analysis.morph.BinaryDictionary; @@ -44,6 +45,20 @@ public final class UnknownDictionary extends BinaryDictionary () -> Files.newInputStream(dictFile)); } + /** + * Create a {@link UnknownDictionary} from an external resource URL (e.g. from Classpath with + * {@link ClassLoader#getResource(String)}). 
+ * + * @param targetMapUrl where to load target map resource + * @param posDictUrl where to load POS dictionary resource + * @param dictUrl where to load dictionary entries resource + * @throws IOException if resource was not found or broken + */ + public UnknownDictionary(URL targetMapUrl, URL posDictUrl, URL dictUrl) throws IOException { + this( + () -> targetMapUrl.openStream(), () -> posDictUrl.openStream(), () -> dictUrl.openStream()); + } + private UnknownDictionary() throws IOException { this( () -> getClassResource(TARGETMAP_FILENAME_SUFFIX), diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestExternalDictionary.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestExternalDictionary.java index b8d84f91666..e793c51e37e 100644 --- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestExternalDictionary.java +++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestExternalDictionary.java @@ -31,6 +31,7 @@ import org.junit.Before; public class TestExternalDictionary extends LuceneTestCase { private Path dir; + private ClassLoader loader = getClass().getClassLoader(); @Override @Before @@ -97,4 +98,32 @@ public class TestExternalDictionary extends LuceneTestCase { new ConnectionCosts(dir.resolve(dictionaryPath + ConnectionCosts.FILENAME_SUFFIX)); assertEquals(0, cc.get(1, 1)); } + + public void testLoadExternalUrlTokenInfoDictionary() throws Exception { + String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', '/'); + TokenInfoDictionary dict = + new TokenInfoDictionary( + loader.getResource(dictionaryPath + TARGETMAP_FILENAME_SUFFIX), + loader.getResource(dictionaryPath + POSDICT_FILENAME_SUFFIX), + loader.getResource(dictionaryPath + DICT_FILENAME_SUFFIX), + loader.getResource(dictionaryPath + FST_FILENAME_SUFFIX)); + assertNotNull(dict.getFST()); + } + + public void testLoadExternalUrlUnknownDictionary() throws Exception { + String dictionaryPath = 
UnknownDictionary.class.getName().replace('.', '/'); + UnknownDictionary dict = + new UnknownDictionary( + loader.getResource(dictionaryPath + TARGETMAP_FILENAME_SUFFIX), + loader.getResource(dictionaryPath + POSDICT_FILENAME_SUFFIX), + loader.getResource(dictionaryPath + DICT_FILENAME_SUFFIX)); + assertNotNull(dict.getCharacterDefinition()); + } + + public void testLoadExternalUrlConnectionCosts() throws Exception { + String dictionaryPath = ConnectionCosts.class.getName().replace('.', '/'); + ConnectionCosts cc = + new ConnectionCosts(loader.getResource(dictionaryPath + ConnectionCosts.FILENAME_SUFFIX)); + assertEquals(0, cc.get(1, 1)); + } }