From 90d363ece7116954c530a74a014487fcbdee7610 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Wed, 28 Apr 2021 13:51:23 +0100 Subject: [PATCH] LUCENE-9930: Only load Ukrainian morfologik dictionary once per JVM (#109) The UkrainianMorfologikAnalyzer was reloading its dictionary every time it created a new TokenStreamComponents, which meant that while the analyzer was open it would hold onto one copy of the dictionary per thread. This commit loads the dictionary in a lazy static initializer, alongside its stopword set. It also makes the normalizer charmap a singleton so that we do not rebuild the same immutable object on every call to initReader. --- lucene/CHANGES.txt | 3 + .../uk/UkrainianMorfologikAnalyzer.java | 62 +++++++++---------- 2 files changed, 34 insertions(+), 31 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 59e678cf378..d98c5bc4bbf 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -243,6 +243,9 @@ Bug fixes * LUCENE-9580: Fix bug in the polygon tessellator when introducing collinear edges during polygon splitting. (Ignacio Vera) +* LUCENE-9930: The Ukrainian analyzer was reloading its dictionary for every new + TokenStreamComponents, which could lead to memory leaks. 
(Alan Woodward) + Changes in Backwards Compatibility Policy * LUCENE-9904: regenerated UAX29URLEmailTokenizer and the corresponding analyzer with up-to-date top diff --git a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java index ae2b78c38df..425a55c6e63 100644 --- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java +++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java @@ -42,11 +42,31 @@ import org.apache.lucene.util.IOUtils; * @since 6.2.0 */ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase { + private final CharArraySet stemExclusionSet; /** File containing default Ukrainian stopwords. */ public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt"; + private static final NormalizeCharMap NORMALIZER_MAP; + + static { + NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); + // different apostrophes + builder.add("\u2019", "'"); + builder.add("\u2018", "'"); + builder.add("\u02BC", "'"); + builder.add("`", "'"); + builder.add("´", "'"); + // ignored characters + builder.add("\u0301", ""); + builder.add("\u00AD", ""); + builder.add("ґ", "г"); + builder.add("Ґ", "Г"); + + NORMALIZER_MAP = builder.build(); + } + /** * Returns an unmodifiable instance of the default stop words set. 
* @@ -57,11 +77,12 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase { } /** - * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the - * static final set the first time.; + * Atomically loads the DEFAULT_STOP_SET and DICTIONARY in a lazy fashion once the outer class + * accesses either static final field for the first time. */ private static class DefaultSetHolder { static final CharArraySet DEFAULT_STOP_SET; + static final Dictionary DICTIONARY; static { try { @@ -71,10 +92,15 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase { UkrainianMorfologikAnalyzer.class, DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8)); + DICTIONARY = + Dictionary.read( + UkrainianMorfologikAnalyzer.class + .getClassLoader() + .getResource("ua/net/nlp/ukrainian.dict")); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) - throw new UncheckedIOException("Unable to load default stopword set", ex); + throw new UncheckedIOException("Unable to load analyzer resources", ex); } } } @@ -107,22 +133,7 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase { @Override protected Reader initReader(String fieldName, Reader reader) { - NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); - // different apostrophes - builder.add("\u2019", "'"); - builder.add("\u2018", "'"); - builder.add("\u02BC", "'"); - builder.add("`", "'"); - builder.add("´", "'"); - // ignored characters - builder.add("\u0301", ""); - builder.add("\u00AD", ""); - builder.add("ґ", "г"); - builder.add("Ґ", "Г"); - - NormalizeCharMap normMap = builder.build(); - reader = new MappingCharFilter(normMap, reader); - return reader; + return new MappingCharFilter(NORMALIZER_MAP, reader); } /** @@ -144,18 +155,7 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase { result = new SetKeywordMarkerFilter(result, stemExclusionSet); } - 
result = new MorfologikFilter(result, getDictionary()); + result = new MorfologikFilter(result, DefaultSetHolder.DICTIONARY); return new TokenStreamComponents(source, result); } - - private static Dictionary getDictionary() { - try { - return Dictionary.read( - UkrainianMorfologikAnalyzer.class - .getClassLoader() - .getResource("ua/net/nlp/ukrainian.dict")); - } catch (IOException e) { - throw new RuntimeException(e); - } - } }