mirror of https://github.com/apache/lucene.git
LUCENE-10171: OpenNLPOpsFactory should directly cache DictionaryLemmatizer objects (#380)
This replaces the previous approach of caching dictionary strings and building multiple redundant DictionaryLemmatizer objects. Co-authored-by: Michael Gibney <michael@michaelgibney.net>
This commit is contained in:
parent
7aec489945
commit
8afec33e74
|
@ -103,6 +103,10 @@ API Changes
|
||||||
* LUCENE-10431: MultiTermQuery.setRewriteMethod() has been deprecated, and constructor
|
* LUCENE-10431: MultiTermQuery.setRewriteMethod() has been deprecated, and constructor
|
||||||
parameters for the various implementations added. (Alan Woodward)
|
parameters for the various implementations added. (Alan Woodward)
|
||||||
|
|
||||||
|
* LUCENE-10171: OpenNLPOpsFactory.getLemmatizerDictionary(String, ResourceLoader) now returns a
|
||||||
|
DictionaryLemmatizer object instead of a raw String serialization of the dictionary.
|
||||||
|
(Spyros Kapnissis via Michael Gibney, Alessandro Benedetti)
|
||||||
|
|
||||||
New Features
|
New Features
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
|
|
@ -18,7 +18,6 @@
|
||||||
package org.apache.lucene.analysis.opennlp.tools;
|
package org.apache.lucene.analysis.opennlp.tools;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
|
||||||
import opennlp.tools.lemmatizer.DictionaryLemmatizer;
|
import opennlp.tools.lemmatizer.DictionaryLemmatizer;
|
||||||
import opennlp.tools.lemmatizer.LemmatizerME;
|
import opennlp.tools.lemmatizer.LemmatizerME;
|
||||||
import opennlp.tools.lemmatizer.LemmatizerModel;
|
import opennlp.tools.lemmatizer.LemmatizerModel;
|
||||||
|
@ -36,11 +35,11 @@ public class NLPLemmatizerOp {
|
||||||
private final DictionaryLemmatizer dictionaryLemmatizer;
|
private final DictionaryLemmatizer dictionaryLemmatizer;
|
||||||
private final LemmatizerME lemmatizerME;
|
private final LemmatizerME lemmatizerME;
|
||||||
|
|
||||||
public NLPLemmatizerOp(InputStream dictionary, LemmatizerModel lemmatizerModel)
|
public NLPLemmatizerOp(DictionaryLemmatizer dictionaryLemmatizer, LemmatizerModel lemmatizerModel)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
assert dictionary != null || lemmatizerModel != null
|
assert dictionaryLemmatizer != null || lemmatizerModel != null
|
||||||
: "At least one parameter must be non-null";
|
: "At least one parameter must be non-null";
|
||||||
dictionaryLemmatizer = dictionary == null ? null : new DictionaryLemmatizer(dictionary);
|
this.dictionaryLemmatizer = dictionaryLemmatizer;
|
||||||
lemmatizerME = lemmatizerModel == null ? null : new LemmatizerME(lemmatizerModel);
|
lemmatizerME = lemmatizerModel == null ? null : new LemmatizerME(lemmatizerModel);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -17,15 +17,12 @@
|
||||||
|
|
||||||
package org.apache.lucene.analysis.opennlp.tools;
|
package org.apache.lucene.analysis.opennlp.tools;
|
||||||
|
|
||||||
import java.io.ByteArrayInputStream;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.io.Reader;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
import opennlp.tools.chunker.ChunkerModel;
|
import opennlp.tools.chunker.ChunkerModel;
|
||||||
|
import opennlp.tools.lemmatizer.DictionaryLemmatizer;
|
||||||
import opennlp.tools.lemmatizer.LemmatizerModel;
|
import opennlp.tools.lemmatizer.LemmatizerModel;
|
||||||
import opennlp.tools.namefind.TokenNameFinderModel;
|
import opennlp.tools.namefind.TokenNameFinderModel;
|
||||||
import opennlp.tools.postag.POSModel;
|
import opennlp.tools.postag.POSModel;
|
||||||
|
@ -45,7 +42,7 @@ public class OpenNLPOpsFactory {
|
||||||
private static ConcurrentHashMap<String, ChunkerModel> chunkerModels = new ConcurrentHashMap<>();
|
private static ConcurrentHashMap<String, ChunkerModel> chunkerModels = new ConcurrentHashMap<>();
|
||||||
private static Map<String, TokenNameFinderModel> nerModels = new ConcurrentHashMap<>();
|
private static Map<String, TokenNameFinderModel> nerModels = new ConcurrentHashMap<>();
|
||||||
private static Map<String, LemmatizerModel> lemmatizerModels = new ConcurrentHashMap<>();
|
private static Map<String, LemmatizerModel> lemmatizerModels = new ConcurrentHashMap<>();
|
||||||
private static Map<String, String> lemmaDictionaries = new ConcurrentHashMap<>();
|
private static Map<String, DictionaryLemmatizer> lemmaDictionaries = new ConcurrentHashMap<>();
|
||||||
|
|
||||||
public static NLPSentenceDetectorOp getSentenceDetector(String modelName) throws IOException {
|
public static NLPSentenceDetectorOp getSentenceDetector(String modelName) throws IOException {
|
||||||
if (modelName != null) {
|
if (modelName != null) {
|
||||||
|
@ -144,36 +141,25 @@ public class OpenNLPOpsFactory {
|
||||||
throws IOException {
|
throws IOException {
|
||||||
assert dictionaryFile != null || lemmatizerModelFile != null
|
assert dictionaryFile != null || lemmatizerModelFile != null
|
||||||
: "At least one parameter must be non-null";
|
: "At least one parameter must be non-null";
|
||||||
InputStream dictionaryInputStream = null;
|
DictionaryLemmatizer dictionaryLemmatizer = null;
|
||||||
if (dictionaryFile != null) {
|
if (dictionaryFile != null) {
|
||||||
String dictionary = lemmaDictionaries.get(dictionaryFile);
|
dictionaryLemmatizer = lemmaDictionaries.get(dictionaryFile);
|
||||||
dictionaryInputStream = new ByteArrayInputStream(dictionary.getBytes(StandardCharsets.UTF_8));
|
|
||||||
}
|
}
|
||||||
LemmatizerModel lemmatizerModel =
|
LemmatizerModel lemmatizerModel =
|
||||||
lemmatizerModelFile == null ? null : lemmatizerModels.get(lemmatizerModelFile);
|
lemmatizerModelFile == null ? null : lemmatizerModels.get(lemmatizerModelFile);
|
||||||
return new NLPLemmatizerOp(dictionaryInputStream, lemmatizerModel);
|
return new NLPLemmatizerOp(dictionaryLemmatizer, lemmatizerModel);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String getLemmatizerDictionary(String dictionaryFile, ResourceLoader loader)
|
public static DictionaryLemmatizer getLemmatizerDictionary(
|
||||||
throws IOException {
|
String dictionaryFile, ResourceLoader loader) throws IOException {
|
||||||
String dictionary = lemmaDictionaries.get(dictionaryFile);
|
DictionaryLemmatizer dictionaryLemmatizer = lemmaDictionaries.get(dictionaryFile);
|
||||||
if (dictionary == null) {
|
if (dictionaryLemmatizer == null) {
|
||||||
try (Reader reader =
|
// TODO: OpenNLP's DictionaryLemmatizer hardcodes the target platform's system encoding,
|
||||||
new InputStreamReader(loader.openResource(dictionaryFile), StandardCharsets.UTF_8)) {
|
// so it needs to match the encoding of the dictionary file.
|
||||||
StringBuilder builder = new StringBuilder();
|
dictionaryLemmatizer = new DictionaryLemmatizer(loader.openResource(dictionaryFile));
|
||||||
char[] chars = new char[8092];
|
lemmaDictionaries.put(dictionaryFile, dictionaryLemmatizer);
|
||||||
int numRead = 0;
|
|
||||||
do {
|
|
||||||
numRead = reader.read(chars, 0, chars.length);
|
|
||||||
if (numRead > 0) {
|
|
||||||
builder.append(chars, 0, numRead);
|
|
||||||
}
|
|
||||||
} while (numRead > 0);
|
|
||||||
dictionary = builder.toString();
|
|
||||||
lemmaDictionaries.put(dictionaryFile, dictionary);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return dictionary;
|
return dictionaryLemmatizer;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static LemmatizerModel getLemmatizerModel(String modelName, ResourceLoader loader)
|
public static LemmatizerModel getLemmatizerModel(String modelName, ResourceLoader loader)
|
||||||
|
|
Loading…
Reference in New Issue