LUCENE-9930: Only load Ukrainian morfologik dictionary once per JVM (#109)

The UkrainianMorfologikAnalyzer was reloading its dictionary every time it created a new TokenStreamComponents. Because TokenStreamComponents are cached per thread by the analyzer's reuse strategy, an open analyzer would hold onto one copy of the dictionary per thread. This commit loads the dictionary once, in a lazy static initializer, alongside the default stopword set. It also makes the normalizer charmap a static singleton so that we do not rebuild the same immutable object on every call to initReader.
2021-04-28 13:51:23 +01:00 · 2021-04-28 13:51:23 +01:00 · 90d363ece7
parent 0c33e621f9
commit 90d363ece7
2 changed files with 34 additions and 31 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -243,6 +243,9 @@ Bug fixes
 * LUCENE-9580: Fix bug in the polygon tessellator when introducing collinear edges during polygon
  splitting. (Ignacio Vera)

+* LUCENE-9930: The Ukrainian analyzer was reloading its dictionary for every new
+  TokenStreamComponents, holding one copy per thread; it is now loaded once. (Alan Woodward)
+
 Changes in Backwards Compatibility Policy

 * LUCENE-9904: regenerated UAX29URLEmailTokenizer and the corresponding analyzer with up-to-date top
--- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java
+++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java
@ -42,11 +42,31 @@ import org.apache.lucene.util.IOUtils;
 * @since 6.2.0
 */
 public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
+
  private final CharArraySet stemExclusionSet;

  /** File containing default Ukrainian stopwords. */
  public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";

+  private static final NormalizeCharMap NORMALIZER_MAP;
+
+  static {
+    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
+    // different apostrophes
+    builder.add("\u2019", "'");
+    builder.add("\u2018", "'");
+    builder.add("\u02BC", "'");
+    builder.add("`", "'");
+    builder.add("´", "'");
+    // ignored characters
+    builder.add("\u0301", "");
+    builder.add("\u00AD", "");
+    builder.add("ґ", "г");
+    builder.add("Ґ", "Г");
+
+    NORMALIZER_MAP = builder.build();
+  }
+
  /**
   * Returns an unmodifiable instance of the default stop words set.
   *
@ -57,11 +77,12 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
  }

  /**
-   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the
-   * static final set the first time.;
+   * Atomically loads the DEFAULT_STOP_SET and DICTIONARY in a lazy fashion once the outer class
+   * accesses either static final field for the first time.
   */
  private static class DefaultSetHolder {
    static final CharArraySet DEFAULT_STOP_SET;
+    static final Dictionary DICTIONARY;

    static {
      try {
@ -71,10 +92,15 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
                    UkrainianMorfologikAnalyzer.class,
                    DEFAULT_STOPWORD_FILE,
                    StandardCharsets.UTF_8));
+        DICTIONARY =
+            Dictionary.read(
+                UkrainianMorfologikAnalyzer.class
+                    .getClassLoader()
+                    .getResource("ua/net/nlp/ukrainian.dict"));
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
-        throw new UncheckedIOException("Unable to load default stopword set", ex);
+        throw new UncheckedIOException("Unable to load analyzer resources", ex);
      }
    }
  }
@ -107,22 +133,7 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {

  @Override
  protected Reader initReader(String fieldName, Reader reader) {
-    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
-    // different apostrophes
-    builder.add("\u2019", "'");
-    builder.add("\u2018", "'");
-    builder.add("\u02BC", "'");
-    builder.add("`", "'");
-    builder.add("´", "'");
-    // ignored characters
-    builder.add("\u0301", "");
-    builder.add("\u00AD", "");
-    builder.add("ґ", "г");
-    builder.add("Ґ", "Г");
-
-    NormalizeCharMap normMap = builder.build();
-    reader = new MappingCharFilter(normMap, reader);
-    return reader;
+    return new MappingCharFilter(NORMALIZER_MAP, reader);
  }

  /**
@ -144,18 +155,7 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }

-    result = new MorfologikFilter(result, getDictionary());
+    result = new MorfologikFilter(result, DefaultSetHolder.DICTIONARY);
    return new TokenStreamComponents(source, result);
  }
-
-  private static Dictionary getDictionary() {
-    try {
-      return Dictionary.read(
-          UkrainianMorfologikAnalyzer.class
-              .getClassLoader()
-              .getResource("ua/net/nlp/ukrainian.dict"));
-    } catch (IOException e) {
-      throw new RuntimeException(e);
-    }
-  }
 }