LUCENE-10171: OpenNLPOpsFactory should directly cache DictionaryLemmatizer objects (#380)

This replaces the previous approach of caching dictionary strings and building multiple redundant DictionaryLemmatizer objects from them.

Co-authored-by: Michael Gibney <michael@michaelgibney.net>
This commit is contained in:
Spyros Kapnissis 2022-03-08 19:47:16 +02:00 committed by GitHub
parent 7aec489945
commit 8afec33e74
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 21 additions and 32 deletions

View File

@ -103,6 +103,10 @@ API Changes
* LUCENE-10431: MultiTermQuery.setRewriteMethod() has been deprecated, and constructor
parameters for the various implementations added. (Alan Woodward)
* LUCENE-10171: OpenNLPOpsFactory.getLemmatizerDictionary(String, ResourceLoader) now returns a
DictionaryLemmatizer object instead of a raw String serialization of the dictionary.
(Spyros Kapnissis via Michael Gibney, Alessandro Benedetti)
New Features
---------------------

View File

@ -18,7 +18,6 @@
package org.apache.lucene.analysis.opennlp.tools;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.lemmatizer.DictionaryLemmatizer;
import opennlp.tools.lemmatizer.LemmatizerME;
import opennlp.tools.lemmatizer.LemmatizerModel;
@ -36,11 +35,11 @@ public class NLPLemmatizerOp {
private final DictionaryLemmatizer dictionaryLemmatizer;
private final LemmatizerME lemmatizerME;
public NLPLemmatizerOp(InputStream dictionary, LemmatizerModel lemmatizerModel)
public NLPLemmatizerOp(DictionaryLemmatizer dictionaryLemmatizer, LemmatizerModel lemmatizerModel)
throws IOException {
assert dictionary != null || lemmatizerModel != null
assert dictionaryLemmatizer != null || lemmatizerModel != null
: "At least one parameter must be non-null";
dictionaryLemmatizer = dictionary == null ? null : new DictionaryLemmatizer(dictionary);
this.dictionaryLemmatizer = dictionaryLemmatizer;
lemmatizerME = lemmatizerModel == null ? null : new LemmatizerME(lemmatizerModel);
}

View File

@ -17,15 +17,12 @@
package org.apache.lucene.analysis.opennlp.tools;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.lemmatizer.DictionaryLemmatizer;
import opennlp.tools.lemmatizer.LemmatizerModel;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.postag.POSModel;
@ -45,7 +42,7 @@ public class OpenNLPOpsFactory {
private static ConcurrentHashMap<String, ChunkerModel> chunkerModels = new ConcurrentHashMap<>();
private static Map<String, TokenNameFinderModel> nerModels = new ConcurrentHashMap<>();
private static Map<String, LemmatizerModel> lemmatizerModels = new ConcurrentHashMap<>();
private static Map<String, String> lemmaDictionaries = new ConcurrentHashMap<>();
private static Map<String, DictionaryLemmatizer> lemmaDictionaries = new ConcurrentHashMap<>();
public static NLPSentenceDetectorOp getSentenceDetector(String modelName) throws IOException {
if (modelName != null) {
@ -144,36 +141,25 @@ public class OpenNLPOpsFactory {
throws IOException {
assert dictionaryFile != null || lemmatizerModelFile != null
: "At least one parameter must be non-null";
InputStream dictionaryInputStream = null;
DictionaryLemmatizer dictionaryLemmatizer = null;
if (dictionaryFile != null) {
String dictionary = lemmaDictionaries.get(dictionaryFile);
dictionaryInputStream = new ByteArrayInputStream(dictionary.getBytes(StandardCharsets.UTF_8));
dictionaryLemmatizer = lemmaDictionaries.get(dictionaryFile);
}
LemmatizerModel lemmatizerModel =
lemmatizerModelFile == null ? null : lemmatizerModels.get(lemmatizerModelFile);
return new NLPLemmatizerOp(dictionaryInputStream, lemmatizerModel);
return new NLPLemmatizerOp(dictionaryLemmatizer, lemmatizerModel);
}
public static String getLemmatizerDictionary(String dictionaryFile, ResourceLoader loader)
throws IOException {
String dictionary = lemmaDictionaries.get(dictionaryFile);
if (dictionary == null) {
try (Reader reader =
new InputStreamReader(loader.openResource(dictionaryFile), StandardCharsets.UTF_8)) {
StringBuilder builder = new StringBuilder();
char[] chars = new char[8092];
int numRead = 0;
do {
numRead = reader.read(chars, 0, chars.length);
if (numRead > 0) {
builder.append(chars, 0, numRead);
public static DictionaryLemmatizer getLemmatizerDictionary(
String dictionaryFile, ResourceLoader loader) throws IOException {
DictionaryLemmatizer dictionaryLemmatizer = lemmaDictionaries.get(dictionaryFile);
if (dictionaryLemmatizer == null) {
// TODO: OpenNLP's DictionaryLemmatizer hardcodes the target platform's system encoding,
// so the dictionary file's encoding must match the platform's system encoding.
dictionaryLemmatizer = new DictionaryLemmatizer(loader.openResource(dictionaryFile));
lemmaDictionaries.put(dictionaryFile, dictionaryLemmatizer);
}
} while (numRead > 0);
dictionary = builder.toString();
lemmaDictionaries.put(dictionaryFile, dictionary);
}
}
return dictionary;
return dictionaryLemmatizer;
}
public static LemmatizerModel getLemmatizerModel(String modelName, ResourceLoader loader)