LUCENE-10171: OpenNLPOpsFactory should directly cache DictionaryLemmatizer objects (#380)

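Previously the factory cached each dictionary as a raw string, so every NLPLemmatizerOp built its own redundant DictionaryLemmatizer from it; the parsed DictionaryLemmatizer is now cached and shared instead.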

Co-authored-by: Michael Gibney <michael@michaelgibney.net>
This commit is contained in:
Spyros Kapnissis 2022-03-08 19:47:16 +02:00 committed by GitHub
parent 7aec489945
commit 8afec33e74
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 21 additions and 32 deletions

View File

@ -103,6 +103,10 @@ API Changes
* LUCENE-10431: MultiTermQuery.setRewriteMethod() has been deprecated, and constructor * LUCENE-10431: MultiTermQuery.setRewriteMethod() has been deprecated, and constructor
parameters for the various implementations added. (Alan Woodward) parameters for the various implementations added. (Alan Woodward)
* LUCENE-10171: OpenNLPOpsFactory.getLemmatizerDictionary(String, ResourceLoader) now returns a
DictionaryLemmatizer object instead of a raw String serialization of the dictionary.
(Spyros Kapnissis via Michael Gibney, Alessandro Benedetti)
New Features New Features
--------------------- ---------------------

View File

@ -18,7 +18,6 @@
package org.apache.lucene.analysis.opennlp.tools; package org.apache.lucene.analysis.opennlp.tools;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.lemmatizer.DictionaryLemmatizer; import opennlp.tools.lemmatizer.DictionaryLemmatizer;
import opennlp.tools.lemmatizer.LemmatizerME; import opennlp.tools.lemmatizer.LemmatizerME;
import opennlp.tools.lemmatizer.LemmatizerModel; import opennlp.tools.lemmatizer.LemmatizerModel;
@ -36,11 +35,11 @@ public class NLPLemmatizerOp {
private final DictionaryLemmatizer dictionaryLemmatizer; private final DictionaryLemmatizer dictionaryLemmatizer;
private final LemmatizerME lemmatizerME; private final LemmatizerME lemmatizerME;
public NLPLemmatizerOp(InputStream dictionary, LemmatizerModel lemmatizerModel) public NLPLemmatizerOp(DictionaryLemmatizer dictionaryLemmatizer, LemmatizerModel lemmatizerModel)
throws IOException { throws IOException {
assert dictionary != null || lemmatizerModel != null assert dictionaryLemmatizer != null || lemmatizerModel != null
: "At least one parameter must be non-null"; : "At least one parameter must be non-null";
dictionaryLemmatizer = dictionary == null ? null : new DictionaryLemmatizer(dictionary); this.dictionaryLemmatizer = dictionaryLemmatizer;
lemmatizerME = lemmatizerModel == null ? null : new LemmatizerME(lemmatizerModel); lemmatizerME = lemmatizerModel == null ? null : new LemmatizerME(lemmatizerModel);
} }

View File

@ -17,15 +17,12 @@
package org.apache.lucene.analysis.opennlp.tools; package org.apache.lucene.analysis.opennlp.tools;
import java.io.ByteArrayInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Map; import java.util.Map;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import opennlp.tools.chunker.ChunkerModel; import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.lemmatizer.DictionaryLemmatizer;
import opennlp.tools.lemmatizer.LemmatizerModel; import opennlp.tools.lemmatizer.LemmatizerModel;
import opennlp.tools.namefind.TokenNameFinderModel; import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.postag.POSModel; import opennlp.tools.postag.POSModel;
@ -45,7 +42,7 @@ public class OpenNLPOpsFactory {
private static ConcurrentHashMap<String, ChunkerModel> chunkerModels = new ConcurrentHashMap<>(); private static ConcurrentHashMap<String, ChunkerModel> chunkerModels = new ConcurrentHashMap<>();
private static Map<String, TokenNameFinderModel> nerModels = new ConcurrentHashMap<>(); private static Map<String, TokenNameFinderModel> nerModels = new ConcurrentHashMap<>();
private static Map<String, LemmatizerModel> lemmatizerModels = new ConcurrentHashMap<>(); private static Map<String, LemmatizerModel> lemmatizerModels = new ConcurrentHashMap<>();
private static Map<String, String> lemmaDictionaries = new ConcurrentHashMap<>(); private static Map<String, DictionaryLemmatizer> lemmaDictionaries = new ConcurrentHashMap<>();
public static NLPSentenceDetectorOp getSentenceDetector(String modelName) throws IOException { public static NLPSentenceDetectorOp getSentenceDetector(String modelName) throws IOException {
if (modelName != null) { if (modelName != null) {
@ -144,36 +141,25 @@ public class OpenNLPOpsFactory {
throws IOException { throws IOException {
assert dictionaryFile != null || lemmatizerModelFile != null assert dictionaryFile != null || lemmatizerModelFile != null
: "At least one parameter must be non-null"; : "At least one parameter must be non-null";
InputStream dictionaryInputStream = null; DictionaryLemmatizer dictionaryLemmatizer = null;
if (dictionaryFile != null) { if (dictionaryFile != null) {
String dictionary = lemmaDictionaries.get(dictionaryFile); dictionaryLemmatizer = lemmaDictionaries.get(dictionaryFile);
dictionaryInputStream = new ByteArrayInputStream(dictionary.getBytes(StandardCharsets.UTF_8));
} }
LemmatizerModel lemmatizerModel = LemmatizerModel lemmatizerModel =
lemmatizerModelFile == null ? null : lemmatizerModels.get(lemmatizerModelFile); lemmatizerModelFile == null ? null : lemmatizerModels.get(lemmatizerModelFile);
return new NLPLemmatizerOp(dictionaryInputStream, lemmatizerModel); return new NLPLemmatizerOp(dictionaryLemmatizer, lemmatizerModel);
} }
public static String getLemmatizerDictionary(String dictionaryFile, ResourceLoader loader) public static DictionaryLemmatizer getLemmatizerDictionary(
throws IOException { String dictionaryFile, ResourceLoader loader) throws IOException {
String dictionary = lemmaDictionaries.get(dictionaryFile); DictionaryLemmatizer dictionaryLemmatizer = lemmaDictionaries.get(dictionaryFile);
if (dictionary == null) { if (dictionaryLemmatizer == null) {
try (Reader reader = // TODO: OpenNLP's DictionaryLemmatizer hardcodes the target platform's system encoding,
new InputStreamReader(loader.openResource(dictionaryFile), StandardCharsets.UTF_8)) { // so it needs to match the encoding of the dictionary file.
StringBuilder builder = new StringBuilder(); dictionaryLemmatizer = new DictionaryLemmatizer(loader.openResource(dictionaryFile));
char[] chars = new char[8092]; lemmaDictionaries.put(dictionaryFile, dictionaryLemmatizer);
int numRead = 0;
do {
numRead = reader.read(chars, 0, chars.length);
if (numRead > 0) {
builder.append(chars, 0, numRead);
}
} while (numRead > 0);
dictionary = builder.toString();
lemmaDictionaries.put(dictionaryFile, dictionary);
}
} }
return dictionary; return dictionaryLemmatizer;
} }
public static LemmatizerModel getLemmatizerModel(String modelName, ResourceLoader loader) public static LemmatizerModel getLemmatizerModel(String modelName, ResourceLoader loader)