LUCENE-10171: OpenNLPOpsFactory should directly cache DictionaryLemmatizer objects (#380)

This replaces the previous approach of caching dictionary strings and building multiple redundant DictionaryLemmatizer objects from them.

Co-authored-by: Michael Gibney <michael@michaelgibney.net>
2022-03-08 19:47:16 +02:00 · 2022-03-08 19:47:16 +02:00 · 8afec33e74
parent 7aec489945
commit 8afec33e74
3 changed files with 21 additions and 32 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -103,6 +103,10 @@ API Changes
 * LUCENE-10431: MultiTermQuery.setRewriteMethod() has been deprecated, and constructor
  parameters for the various implementations added. (Alan Woodward)

+* LUCENE-10171: OpenNLPOpsFactory.getLemmatizerDictionary(String, ResourceLoader) now returns a
+  DictionaryLemmatizer object instead of a raw String serialization of the dictionary.
+  (Spyros Kapnissis via Michael Gibney, Alessandro Benedetti)
+
 New Features
 ---------------------

--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPLemmatizerOp.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPLemmatizerOp.java
@@ -18,7 +18,6 @@
 package org.apache.lucene.analysis.opennlp.tools;

 import java.io.IOException;
-import java.io.InputStream;
 import opennlp.tools.lemmatizer.DictionaryLemmatizer;
 import opennlp.tools.lemmatizer.LemmatizerME;
 import opennlp.tools.lemmatizer.LemmatizerModel;
@@ -36,11 +35,11 @@ public class NLPLemmatizerOp {
  private final DictionaryLemmatizer dictionaryLemmatizer;
  private final LemmatizerME lemmatizerME;

-  public NLPLemmatizerOp(InputStream dictionary, LemmatizerModel lemmatizerModel)
+  public NLPLemmatizerOp(DictionaryLemmatizer dictionaryLemmatizer, LemmatizerModel lemmatizerModel)
      throws IOException {
-    assert dictionary != null || lemmatizerModel != null
+    assert dictionaryLemmatizer != null || lemmatizerModel != null
        : "At least one parameter must be non-null";
-    dictionaryLemmatizer = dictionary == null ? null : new DictionaryLemmatizer(dictionary);
+    this.dictionaryLemmatizer = dictionaryLemmatizer;
    lemmatizerME = lemmatizerModel == null ? null : new LemmatizerME(lemmatizerModel);
  }

--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/OpenNLPOpsFactory.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/OpenNLPOpsFactory.java
@@ -17,15 +17,12 @@

 package org.apache.lucene.analysis.opennlp.tools;

-import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.nio.charset.StandardCharsets;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 import opennlp.tools.chunker.ChunkerModel;
+import opennlp.tools.lemmatizer.DictionaryLemmatizer;
 import opennlp.tools.lemmatizer.LemmatizerModel;
 import opennlp.tools.namefind.TokenNameFinderModel;
 import opennlp.tools.postag.POSModel;
@@ -45,7 +42,7 @@ public class OpenNLPOpsFactory {
  private static ConcurrentHashMap<String, ChunkerModel> chunkerModels = new ConcurrentHashMap<>();
  private static Map<String, TokenNameFinderModel> nerModels = new ConcurrentHashMap<>();
  private static Map<String, LemmatizerModel> lemmatizerModels = new ConcurrentHashMap<>();
-  private static Map<String, String> lemmaDictionaries = new ConcurrentHashMap<>();
+  private static Map<String, DictionaryLemmatizer> lemmaDictionaries = new ConcurrentHashMap<>();

  public static NLPSentenceDetectorOp getSentenceDetector(String modelName) throws IOException {
    if (modelName != null) {
@@ -144,36 +141,25 @@ public class OpenNLPOpsFactory {
      throws IOException {
    assert dictionaryFile != null || lemmatizerModelFile != null
        : "At least one parameter must be non-null";
-    InputStream dictionaryInputStream = null;
+    DictionaryLemmatizer dictionaryLemmatizer = null;
    if (dictionaryFile != null) {
-      String dictionary = lemmaDictionaries.get(dictionaryFile);
-      dictionaryInputStream = new ByteArrayInputStream(dictionary.getBytes(StandardCharsets.UTF_8));
+      dictionaryLemmatizer = lemmaDictionaries.get(dictionaryFile);
    }
    LemmatizerModel lemmatizerModel =
        lemmatizerModelFile == null ? null : lemmatizerModels.get(lemmatizerModelFile);
-    return new NLPLemmatizerOp(dictionaryInputStream, lemmatizerModel);
+    return new NLPLemmatizerOp(dictionaryLemmatizer, lemmatizerModel);
  }

-  public static String getLemmatizerDictionary(String dictionaryFile, ResourceLoader loader)
-      throws IOException {
-    String dictionary = lemmaDictionaries.get(dictionaryFile);
-    if (dictionary == null) {
-      try (Reader reader =
-          new InputStreamReader(loader.openResource(dictionaryFile), StandardCharsets.UTF_8)) {
-        StringBuilder builder = new StringBuilder();
-        char[] chars = new char[8092];
-        int numRead = 0;
-        do {
-          numRead = reader.read(chars, 0, chars.length);
-          if (numRead > 0) {
-            builder.append(chars, 0, numRead);
-          }
-        } while (numRead > 0);
-        dictionary = builder.toString();
-        lemmaDictionaries.put(dictionaryFile, dictionary);
-      }
+  public static DictionaryLemmatizer getLemmatizerDictionary(
+      String dictionaryFile, ResourceLoader loader) throws IOException {
+    DictionaryLemmatizer dictionaryLemmatizer = lemmaDictionaries.get(dictionaryFile);
+    if (dictionaryLemmatizer == null) {
+      // TODO: OpenNLP's DictionaryLemmatizer hardcodes the target platform's system encoding,
+      // so it needs to match the encoding of the dictionary file.
+      dictionaryLemmatizer = new DictionaryLemmatizer(loader.openResource(dictionaryFile));
+      lemmaDictionaries.put(dictionaryFile, dictionaryLemmatizer);
    }
-    return dictionary;
+    return dictionaryLemmatizer;
  }

  public static LemmatizerModel getLemmatizerModel(String modelName, ResourceLoader loader)