mirror of https://github.com/apache/lucene.git
LUCENE-10171: OpenNLPOpsFactory should directly cache DictionaryLemmatizer objects (#380)
This change caches the DictionaryLemmatizer objects themselves, instead of caching dictionary strings and building multiple redundant DictionaryLemmatizer objects from them. Co-authored-by: Michael Gibney <michael@michaelgibney.net>
This commit is contained in:
parent
7aec489945
commit
8afec33e74
|
@ -103,6 +103,10 @@ API Changes
|
|||
* LUCENE-10431: MultiTermQuery.setRewriteMethod() has been deprecated, and constructor
|
||||
parameters for the various implementations added. (Alan Woodward)
|
||||
|
||||
* LUCENE-10171: OpenNLPOpsFactory.getLemmatizerDictionary(String, ResourceLoader) now returns a
|
||||
DictionaryLemmatizer object instead of a raw String serialization of the dictionary.
|
||||
(Spyros Kapnissis via Michael Gibney, Alessandro Benedetti)
|
||||
|
||||
New Features
|
||||
---------------------
|
||||
|
||||
|
|
|
@ -18,7 +18,6 @@
|
|||
package org.apache.lucene.analysis.opennlp.tools;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import opennlp.tools.lemmatizer.DictionaryLemmatizer;
|
||||
import opennlp.tools.lemmatizer.LemmatizerME;
|
||||
import opennlp.tools.lemmatizer.LemmatizerModel;
|
||||
|
@ -36,11 +35,11 @@ public class NLPLemmatizerOp {
|
|||
private final DictionaryLemmatizer dictionaryLemmatizer;
|
||||
private final LemmatizerME lemmatizerME;
|
||||
|
||||
public NLPLemmatizerOp(InputStream dictionary, LemmatizerModel lemmatizerModel)
|
||||
public NLPLemmatizerOp(DictionaryLemmatizer dictionaryLemmatizer, LemmatizerModel lemmatizerModel)
|
||||
throws IOException {
|
||||
assert dictionary != null || lemmatizerModel != null
|
||||
assert dictionaryLemmatizer != null || lemmatizerModel != null
|
||||
: "At least one parameter must be non-null";
|
||||
dictionaryLemmatizer = dictionary == null ? null : new DictionaryLemmatizer(dictionary);
|
||||
this.dictionaryLemmatizer = dictionaryLemmatizer;
|
||||
lemmatizerME = lemmatizerModel == null ? null : new LemmatizerME(lemmatizerModel);
|
||||
}
|
||||
|
||||
|
|
|
@ -17,15 +17,12 @@
|
|||
|
||||
package org.apache.lucene.analysis.opennlp.tools;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import opennlp.tools.chunker.ChunkerModel;
|
||||
import opennlp.tools.lemmatizer.DictionaryLemmatizer;
|
||||
import opennlp.tools.lemmatizer.LemmatizerModel;
|
||||
import opennlp.tools.namefind.TokenNameFinderModel;
|
||||
import opennlp.tools.postag.POSModel;
|
||||
|
@ -45,7 +42,7 @@ public class OpenNLPOpsFactory {
|
|||
private static ConcurrentHashMap<String, ChunkerModel> chunkerModels = new ConcurrentHashMap<>();
|
||||
private static Map<String, TokenNameFinderModel> nerModels = new ConcurrentHashMap<>();
|
||||
private static Map<String, LemmatizerModel> lemmatizerModels = new ConcurrentHashMap<>();
|
||||
private static Map<String, String> lemmaDictionaries = new ConcurrentHashMap<>();
|
||||
private static Map<String, DictionaryLemmatizer> lemmaDictionaries = new ConcurrentHashMap<>();
|
||||
|
||||
public static NLPSentenceDetectorOp getSentenceDetector(String modelName) throws IOException {
|
||||
if (modelName != null) {
|
||||
|
@ -144,36 +141,25 @@ public class OpenNLPOpsFactory {
|
|||
throws IOException {
|
||||
assert dictionaryFile != null || lemmatizerModelFile != null
|
||||
: "At least one parameter must be non-null";
|
||||
InputStream dictionaryInputStream = null;
|
||||
DictionaryLemmatizer dictionaryLemmatizer = null;
|
||||
if (dictionaryFile != null) {
|
||||
String dictionary = lemmaDictionaries.get(dictionaryFile);
|
||||
dictionaryInputStream = new ByteArrayInputStream(dictionary.getBytes(StandardCharsets.UTF_8));
|
||||
dictionaryLemmatizer = lemmaDictionaries.get(dictionaryFile);
|
||||
}
|
||||
LemmatizerModel lemmatizerModel =
|
||||
lemmatizerModelFile == null ? null : lemmatizerModels.get(lemmatizerModelFile);
|
||||
return new NLPLemmatizerOp(dictionaryInputStream, lemmatizerModel);
|
||||
return new NLPLemmatizerOp(dictionaryLemmatizer, lemmatizerModel);
|
||||
}
|
||||
|
||||
public static String getLemmatizerDictionary(String dictionaryFile, ResourceLoader loader)
|
||||
throws IOException {
|
||||
String dictionary = lemmaDictionaries.get(dictionaryFile);
|
||||
if (dictionary == null) {
|
||||
try (Reader reader =
|
||||
new InputStreamReader(loader.openResource(dictionaryFile), StandardCharsets.UTF_8)) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
char[] chars = new char[8092];
|
||||
int numRead = 0;
|
||||
do {
|
||||
numRead = reader.read(chars, 0, chars.length);
|
||||
if (numRead > 0) {
|
||||
builder.append(chars, 0, numRead);
|
||||
public static DictionaryLemmatizer getLemmatizerDictionary(
|
||||
String dictionaryFile, ResourceLoader loader) throws IOException {
|
||||
DictionaryLemmatizer dictionaryLemmatizer = lemmaDictionaries.get(dictionaryFile);
|
||||
if (dictionaryLemmatizer == null) {
|
||||
// TODO: OpenNLP's DictionaryLemmatizer hardcodes the target platform's system encoding,
|
||||
// so the dictionary file must be encoded in the platform's default charset.
|
||||
dictionaryLemmatizer = new DictionaryLemmatizer(loader.openResource(dictionaryFile));
|
||||
lemmaDictionaries.put(dictionaryFile, dictionaryLemmatizer);
|
||||
}
|
||||
} while (numRead > 0);
|
||||
dictionary = builder.toString();
|
||||
lemmaDictionaries.put(dictionaryFile, dictionary);
|
||||
}
|
||||
}
|
||||
return dictionary;
|
||||
return dictionaryLemmatizer;
|
||||
}
|
||||
|
||||
public static LemmatizerModel getLemmatizerModel(String modelName, ResourceLoader loader)
|
||||
|
|
Loading…
Reference in New Issue