From 8afec33e747ec81c2301a4b099bd26b4195a556e Mon Sep 17 00:00:00 2001
From: Spyros Kapnissis <skapni@gmail.com>
Date: Tue, 8 Mar 2022 19:47:16 +0200
Subject: [PATCH] LUCENE-10171: OpenNLPOpsFactory should directly cache
 DictionaryLemmatizer objects (#380)

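Previously, OpenNLPOpsFactory cached each lemmatizer dictionary as a raw String,
and every NLPLemmatizerOp re-parsed that String into its own redundant
DictionaryLemmatizer. The factory now caches the parsed DictionaryLemmatizer
itself and passes the cached instance to each NLPLemmatizerOp.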

Co-authored-by: Michael Gibney <michael@michaelgibney.net>
---
 lucene/CHANGES.txt                            |  4 ++
 .../opennlp/tools/NLPLemmatizerOp.java        |  7 ++--
 .../opennlp/tools/OpenNLPOpsFactory.java      | 42 +++++++------------
 3 files changed, 21 insertions(+), 32 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 105899c54ce..7eddb1f6a8f 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -103,6 +103,10 @@ API Changes
 * LUCENE-10431: MultiTermQuery.setRewriteMethod() has been deprecated, and constructor
   parameters for the various implementations added. (Alan Woodward)
 
+* LUCENE-10171: OpenNLPOpsFactory.getLemmatizerDictionary(String, ResourceLoader) now returns a
+  DictionaryLemmatizer object instead of a raw String serialization of the dictionary.
+  (Spyros Kapnissis via Michael Gibney, Alessandro Benedetti)
+
 New Features
 ---------------------
 
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPLemmatizerOp.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPLemmatizerOp.java
index de0469aa4a9..ed8f8aa5807 100644
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPLemmatizerOp.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPLemmatizerOp.java
@@ -18,7 +18,6 @@
 package org.apache.lucene.analysis.opennlp.tools;
 
 import java.io.IOException;
-import java.io.InputStream;
 import opennlp.tools.lemmatizer.DictionaryLemmatizer;
 import opennlp.tools.lemmatizer.LemmatizerME;
 import opennlp.tools.lemmatizer.LemmatizerModel;
@@ -36,11 +35,11 @@ public class NLPLemmatizerOp {
   private final DictionaryLemmatizer dictionaryLemmatizer;
   private final LemmatizerME lemmatizerME;
 
-  public NLPLemmatizerOp(InputStream dictionary, LemmatizerModel lemmatizerModel)
+  public NLPLemmatizerOp(DictionaryLemmatizer dictionaryLemmatizer, LemmatizerModel lemmatizerModel)
       throws IOException {
-    assert dictionary != null || lemmatizerModel != null
+    assert dictionaryLemmatizer != null || lemmatizerModel != null
         : "At least one parameter must be non-null";
-    dictionaryLemmatizer = dictionary == null ? null : new DictionaryLemmatizer(dictionary);
+    this.dictionaryLemmatizer = dictionaryLemmatizer;
     lemmatizerME = lemmatizerModel == null ? null : new LemmatizerME(lemmatizerModel);
   }
 
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/OpenNLPOpsFactory.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/OpenNLPOpsFactory.java
index 75745bcca75..7458cfd85b0 100644
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/OpenNLPOpsFactory.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/OpenNLPOpsFactory.java
@@ -17,15 +17,12 @@
 
 package org.apache.lucene.analysis.opennlp.tools;
 
-import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.nio.charset.StandardCharsets;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 import opennlp.tools.chunker.ChunkerModel;
+import opennlp.tools.lemmatizer.DictionaryLemmatizer;
 import opennlp.tools.lemmatizer.LemmatizerModel;
 import opennlp.tools.namefind.TokenNameFinderModel;
 import opennlp.tools.postag.POSModel;
@@ -45,7 +42,7 @@ public class OpenNLPOpsFactory {
   private static ConcurrentHashMap<String, ChunkerModel> chunkerModels = new ConcurrentHashMap<>();
   private static Map<String, TokenNameFinderModel> nerModels = new ConcurrentHashMap<>();
   private static Map<String, LemmatizerModel> lemmatizerModels = new ConcurrentHashMap<>();
-  private static Map<String, String> lemmaDictionaries = new ConcurrentHashMap<>();
+  private static Map<String, DictionaryLemmatizer> lemmaDictionaries = new ConcurrentHashMap<>();
 
   public static NLPSentenceDetectorOp getSentenceDetector(String modelName) throws IOException {
     if (modelName != null) {
@@ -144,36 +141,25 @@ public class OpenNLPOpsFactory {
       throws IOException {
     assert dictionaryFile != null || lemmatizerModelFile != null
         : "At least one parameter must be non-null";
-    InputStream dictionaryInputStream = null;
+    DictionaryLemmatizer dictionaryLemmatizer = null;
     if (dictionaryFile != null) {
-      String dictionary = lemmaDictionaries.get(dictionaryFile);
-      dictionaryInputStream = new ByteArrayInputStream(dictionary.getBytes(StandardCharsets.UTF_8));
+      dictionaryLemmatizer = lemmaDictionaries.get(dictionaryFile);
     }
     LemmatizerModel lemmatizerModel =
         lemmatizerModelFile == null ? null : lemmatizerModels.get(lemmatizerModelFile);
-    return new NLPLemmatizerOp(dictionaryInputStream, lemmatizerModel);
+    return new NLPLemmatizerOp(dictionaryLemmatizer, lemmatizerModel);
   }
 
-  public static String getLemmatizerDictionary(String dictionaryFile, ResourceLoader loader)
-      throws IOException {
-    String dictionary = lemmaDictionaries.get(dictionaryFile);
-    if (dictionary == null) {
-      try (Reader reader =
-          new InputStreamReader(loader.openResource(dictionaryFile), StandardCharsets.UTF_8)) {
-        StringBuilder builder = new StringBuilder();
-        char[] chars = new char[8092];
-        int numRead = 0;
-        do {
-          numRead = reader.read(chars, 0, chars.length);
-          if (numRead > 0) {
-            builder.append(chars, 0, numRead);
-          }
-        } while (numRead > 0);
-        dictionary = builder.toString();
-        lemmaDictionaries.put(dictionaryFile, dictionary);
-      }
+  public static DictionaryLemmatizer getLemmatizerDictionary(
+      String dictionaryFile, ResourceLoader loader) throws IOException {
+    DictionaryLemmatizer dictionaryLemmatizer = lemmaDictionaries.get(dictionaryFile);
+    if (dictionaryLemmatizer == null) {
+      // TODO: OpenNLP's DictionaryLemmatizer hardcodes the target platform's system encoding,
+      // so it needs to match the encoding of the dictionary file.
+      dictionaryLemmatizer = new DictionaryLemmatizer(loader.openResource(dictionaryFile));
+      lemmaDictionaries.put(dictionaryFile, dictionaryLemmatizer);
     }
-    return dictionary;
+    return dictionaryLemmatizer;
   }
 
   public static LemmatizerModel getLemmatizerModel(String modelName, ResourceLoader loader)