LUCENE-9930: Only load Ukrainian morfologik dictionary once per JVM (#109)

The UkrainianMorfologikAnalyzer was reloading its dictionary every time it created a new TokenStreamComponents, which meant that while the analyzer was open it would hold onto one copy of the dictionary per thread. This commit loads the dictionary in a lazy static initializer, alongside its stopword set. It also makes the normalizer charmap a singleton so that we do not rebuild the same immutable object on every call to initReader.
2021-04-28 13:51:23 +01:00 · 2021-04-28 13:51:23 +01:00 · 90d363ece7
parent 0c33e621f9
commit 90d363ece7
2 changed files with 34 additions and 31 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -243,6 +243,9 @@ Bug fixes
 * LUCENE-9580: Fix bug in the polygon tessellator when introducing collinear edges during polygon
  splitting. (Ignacio Vera)
 * LUCENE-9930: The Ukrainian analyzer was reloading its dictionary for every new
  TokenStreamComponents, which could lead to memory leaks. (Alan Woodward)
 Changes in Backwards Compatibility Policy
 * LUCENE-9904: regenerated UAX29URLEmailTokenizer and the corresponding analyzer with up-to-date top
--- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java
+++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java
@ -42,11 +42,31 @@ import org.apache.lucene.util.IOUtils;
 * @since 6.2.0
 */
 public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
  private final CharArraySet stemExclusionSet;
  /** File containing default Ukrainian stopwords. */
  public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
  /**
   * Character normalization map, built once in the static initializer below and handed to the
   * {@code MappingCharFilter} created in {@code initReader}, so the same mappings are not rebuilt
   * on every call.
   */
  private static final NormalizeCharMap NORMALIZER_MAP;
  static {
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    // different apostrophes: every variant below is mapped to the plain ASCII apostrophe
    builder.add("\u2019", "'");
    builder.add("\u2018", "'");
    builder.add("\u02BC", "'");
    builder.add("`", "'");
    builder.add("´", "'");
    // ignored characters: U+0301 (combining acute accent) and U+00AD (soft hyphen) are mapped to
    // the empty string, i.e. dropped from the input
    builder.add("\u0301", "");
    builder.add("\u00AD", "");
    // replace ґ/Ґ with г/Г
    builder.add("ґ", "г");
    builder.add("Ґ", "Г");
    NORMALIZER_MAP = builder.build();
  }
  /**
   * Returns an unmodifiable instance of the default stop words set.
   *
@ -57,11 +77,12 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
  }
  /**
-   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the
+   * Atomically loads the DEFAULT_STOP_SET and DICTIONARY in a lazy fashion once the outer class
-   * static final set the first time.;
+   * accesses the static final fields for the first time.
   */
  private static class DefaultSetHolder {
    static final CharArraySet DEFAULT_STOP_SET;
    static final Dictionary DICTIONARY;
    static {
      try {
@ -71,10 +92,15 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
                    UkrainianMorfologikAnalyzer.class,
                    DEFAULT_STOPWORD_FILE,
                    StandardCharsets.UTF_8));
        DICTIONARY =
            Dictionary.read(
                UkrainianMorfologikAnalyzer.class
                    .getClassLoader()
                    .getResource("ua/net/nlp/ukrainian.dict"));
      } catch (IOException ex) {
        // the default stopword set and the dictionary should always be present
        // as they are part of the distribution (JAR)
-        throw new UncheckedIOException("Unable to load default stopword set", ex);
+        throw new UncheckedIOException("Unable to load analyzer resources", ex);
      }
    }
  }
@ -107,22 +133,7 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
  @Override
  protected Reader initReader(String fieldName, Reader reader) {
-    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
+    return new MappingCharFilter(NORMALIZER_MAP, reader);
    // different apostrophes
    builder.add("\u2019", "'");
    builder.add("\u2018", "'");
    builder.add("\u02BC", "'");
    builder.add("`", "'");
    builder.add("´", "'");
    // ignored characters
    builder.add("\u0301", "");
    builder.add("\u00AD", "");
    builder.add("ґ", "г");
    builder.add("Ґ", "Г");
    NormalizeCharMap normMap = builder.build();
    reader = new MappingCharFilter(normMap, reader);
    return reader;
  }
  /**
@ -144,18 +155,7 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }
-    result = new MorfologikFilter(result, getDictionary());
+    result = new MorfologikFilter(result, DefaultSetHolder.DICTIONARY);
    return new TokenStreamComponents(source, result);
  }
  private static Dictionary getDictionary() {
    try {
      return Dictionary.read(
          UkrainianMorfologikAnalyzer.class
              .getClassLoader()
              .getResource("ua/net/nlp/ukrainian.dict"));
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
 }