From 8afec33e747ec81c2301a4b099bd26b4195a556e Mon Sep 17 00:00:00 2001 From: Spyros Kapnissis Date: Tue, 8 Mar 2022 19:47:16 +0200 Subject: [PATCH] LUCENE-10171: OpenNLPOpsFactory should directly cache DictionaryLemmatizer objects (#380) Instead of caching dictionary strings and building multiple redundant DictionaryLemmatizer objects. Co-authored-by: Michael Gibney --- lucene/CHANGES.txt | 4 ++ .../opennlp/tools/NLPLemmatizerOp.java | 7 ++-- .../opennlp/tools/OpenNLPOpsFactory.java | 42 +++++++------------ 3 files changed, 21 insertions(+), 32 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 105899c54ce..7eddb1f6a8f 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -103,6 +103,10 @@ API Changes * LUCENE-10431: MultiTermQuery.setRewriteMethod() has been deprecated, and constructor parameters for the various implementations added. (Alan Woodward) +* LUCENE-10171: OpenNLPOpsFactory.getLemmatizerDictionary(String, ResourceLoader) now returns a + DictionaryLemmatizer object instead of a raw String serialization of the dictionary. 
+ (Spyros Kapnissis via Michael Gibney, Alessandro Benedetti) + New Features --------------------- diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPLemmatizerOp.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPLemmatizerOp.java index de0469aa4a9..ed8f8aa5807 100644 --- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPLemmatizerOp.java +++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPLemmatizerOp.java @@ -18,7 +18,6 @@ package org.apache.lucene.analysis.opennlp.tools; import java.io.IOException; -import java.io.InputStream; import opennlp.tools.lemmatizer.DictionaryLemmatizer; import opennlp.tools.lemmatizer.LemmatizerME; import opennlp.tools.lemmatizer.LemmatizerModel; @@ -36,11 +35,11 @@ public class NLPLemmatizerOp { private final DictionaryLemmatizer dictionaryLemmatizer; private final LemmatizerME lemmatizerME; - public NLPLemmatizerOp(InputStream dictionary, LemmatizerModel lemmatizerModel) + public NLPLemmatizerOp(DictionaryLemmatizer dictionaryLemmatizer, LemmatizerModel lemmatizerModel) throws IOException { - assert dictionary != null || lemmatizerModel != null + assert dictionaryLemmatizer != null || lemmatizerModel != null : "At least one parameter must be non-null"; - dictionaryLemmatizer = dictionary == null ? null : new DictionaryLemmatizer(dictionary); + this.dictionaryLemmatizer = dictionaryLemmatizer; lemmatizerME = lemmatizerModel == null ? 
null : new LemmatizerME(lemmatizerModel); } diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/OpenNLPOpsFactory.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/OpenNLPOpsFactory.java index 75745bcca75..7458cfd85b0 100644 --- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/OpenNLPOpsFactory.java +++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/OpenNLPOpsFactory.java @@ -17,15 +17,12 @@ package org.apache.lucene.analysis.opennlp.tools; -import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.Reader; -import java.nio.charset.StandardCharsets; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import opennlp.tools.chunker.ChunkerModel; +import opennlp.tools.lemmatizer.DictionaryLemmatizer; import opennlp.tools.lemmatizer.LemmatizerModel; import opennlp.tools.namefind.TokenNameFinderModel; import opennlp.tools.postag.POSModel; @@ -45,7 +42,7 @@ public class OpenNLPOpsFactory { private static ConcurrentHashMap<String, ChunkerModel> chunkerModels = new ConcurrentHashMap<>(); private static Map<String, TokenNameFinderModel> nerModels = new ConcurrentHashMap<>(); private static Map<String, LemmatizerModel> lemmatizerModels = new ConcurrentHashMap<>(); - private static Map<String, String> lemmaDictionaries = new ConcurrentHashMap<>(); + private static Map<String, DictionaryLemmatizer> lemmaDictionaries = new ConcurrentHashMap<>(); public static NLPSentenceDetectorOp getSentenceDetector(String modelName) throws IOException { if (modelName != null) { @@ -144,36 +141,25 @@ public class OpenNLPOpsFactory { throws IOException { assert dictionaryFile != null || lemmatizerModelFile != null : "At least one parameter must be non-null"; - InputStream dictionaryInputStream = null; + DictionaryLemmatizer dictionaryLemmatizer = null; if (dictionaryFile != null) { - String dictionary = lemmaDictionaries.get(dictionaryFile); - dictionaryInputStream = new 
ByteArrayInputStream(dictionary.getBytes(StandardCharsets.UTF_8)); + dictionaryLemmatizer = lemmaDictionaries.get(dictionaryFile); } LemmatizerModel lemmatizerModel = lemmatizerModelFile == null ? null : lemmatizerModels.get(lemmatizerModelFile); - return new NLPLemmatizerOp(dictionaryInputStream, lemmatizerModel); + return new NLPLemmatizerOp(dictionaryLemmatizer, lemmatizerModel); } - public static String getLemmatizerDictionary(String dictionaryFile, ResourceLoader loader) - throws IOException { - String dictionary = lemmaDictionaries.get(dictionaryFile); - if (dictionary == null) { - try (Reader reader = - new InputStreamReader(loader.openResource(dictionaryFile), StandardCharsets.UTF_8)) { - StringBuilder builder = new StringBuilder(); - char[] chars = new char[8092]; - int numRead = 0; - do { - numRead = reader.read(chars, 0, chars.length); - if (numRead > 0) { - builder.append(chars, 0, numRead); - } - } while (numRead > 0); - dictionary = builder.toString(); - lemmaDictionaries.put(dictionaryFile, dictionary); - } + public static DictionaryLemmatizer getLemmatizerDictionary( + String dictionaryFile, ResourceLoader loader) throws IOException { + DictionaryLemmatizer dictionaryLemmatizer = lemmaDictionaries.get(dictionaryFile); + if (dictionaryLemmatizer == null) { + // TODO: OpenNLP's DictionaryLemmatizer hardcodes the target platform's system encoding, + // so it needs to match the encoding of the dictionary file. + dictionaryLemmatizer = new DictionaryLemmatizer(loader.openResource(dictionaryFile)); + lemmaDictionaries.put(dictionaryFile, dictionaryLemmatizer); } - return dictionary; + return dictionaryLemmatizer; } public static LemmatizerModel getLemmatizerModel(String modelName, ResourceLoader loader)