LUCENE-9930: Only load Ukrainian morfologik dictionary once per JVM (#109)

The UkrainianMorfologikAnalyzer was reloading its dictionary every
time it created a new TokenStreamComponents, which meant that
while the analyzer was open it would hold onto one copy of the
dictionary per thread.

This commit loads the dictionary in a lazy static initializer, alongside
its stopword set. It also makes the normalizer charmap a singleton
so that we do not rebuild the same immutable object on every call
to initReader.
This commit is contained in:
Alan Woodward 2021-04-28 13:51:23 +01:00 committed by GitHub
parent 0c33e621f9
commit 90d363ece7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 34 additions and 31 deletions

View File

@ -243,6 +243,9 @@ Bug fixes
* LUCENE-9580: Fix bug in the polygon tessellator when introducing collinear edges during polygon
splitting. (Ignacio Vera)
* LUCENE-9930: The Ukrainian analyzer was reloading its dictionary for every new
TokenStreamComponents, which could lead to memory leaks. (Alan Woodward)
Changes in Backwards Compatibility Policy
* LUCENE-9904: regenerated UAX29URLEmailTokenizer and the corresponding analyzer with up-to-date top

View File

@ -42,11 +42,31 @@ import org.apache.lucene.util.IOUtils;
* @since 6.2.0
*/
public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
private final CharArraySet stemExclusionSet;
/** File containing default Ukrainian stopwords. */
public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
/**
 * Character normalization map shared by all instances. It is immutable once built, so it is
 * constructed a single time here rather than on every call to initReader.
 */
private static final NormalizeCharMap NORMALIZER_MAP;
static {
NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
// different apostrophes: map typographic and look-alike marks to the ASCII apostrophe
builder.add("\u2019", "'"); // right single quotation mark
builder.add("\u2018", "'"); // left single quotation mark
builder.add("\u02BC", "'"); // modifier letter apostrophe
builder.add("`", "'"); // grave accent
builder.add("´", "'"); // acute accent
// ignored characters: mapped to the empty string, i.e. removed from the input
builder.add("\u0301", ""); // combining acute accent
builder.add("\u00AD", ""); // soft hyphen
// letter folding: ghe with upturn is replaced by plain ghe, in both cases
builder.add("ґ", "г");
builder.add("Ґ", "Г");
NORMALIZER_MAP = builder.build();
}
/**
* Returns an unmodifiable instance of the default stop words set.
*
@ -57,11 +77,12 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
}
/**
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the
* static final set the first time.;
* Atomically loads the DEFAULT_STOP_SET and DICTIONARY in a lazy fashion once the outer class
* accesses either static final field for the first time.
*/
private static class DefaultSetHolder {
static final CharArraySet DEFAULT_STOP_SET;
static final Dictionary DICTIONARY;
static {
try {
@ -71,10 +92,15 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
UkrainianMorfologikAnalyzer.class,
DEFAULT_STOPWORD_FILE,
StandardCharsets.UTF_8));
DICTIONARY =
Dictionary.read(
UkrainianMorfologikAnalyzer.class
.getClassLoader()
.getResource("ua/net/nlp/ukrainian.dict"));
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new UncheckedIOException("Unable to load default stopword set", ex);
throw new UncheckedIOException("Unable to load analyzer resources", ex);
}
}
}
@ -107,22 +133,7 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
@Override
protected Reader initReader(String fieldName, Reader reader) {
NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
// different apostrophes
builder.add("\u2019", "'");
builder.add("\u2018", "'");
builder.add("\u02BC", "'");
builder.add("`", "'");
builder.add("´", "'");
// ignored characters
builder.add("\u0301", "");
builder.add("\u00AD", "");
builder.add("ґ", "г");
builder.add("Ґ", "Г");
NormalizeCharMap normMap = builder.build();
reader = new MappingCharFilter(normMap, reader);
return reader;
return new MappingCharFilter(NORMALIZER_MAP, reader);
}
/**
@ -144,18 +155,7 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
}
result = new MorfologikFilter(result, getDictionary());
result = new MorfologikFilter(result, DefaultSetHolder.DICTIONARY);
return new TokenStreamComponents(source, result);
}
/**
 * Reads the Ukrainian morfologik dictionary from the classpath resource
 * {@code ua/net/nlp/ukrainian.dict}, located through this class's class loader.
 *
 * <p>Nothing is cached here: every call performs a fresh {@code Dictionary.read}.
 *
 * @return the dictionary read from the resource
 * @throws RuntimeException wrapping the IOException if the resource cannot be read
 */
private static Dictionary getDictionary() {
try {
return Dictionary.read(
UkrainianMorfologikAnalyzer.class
.getClassLoader()
.getResource("ua/net/nlp/ukrainian.dict"));
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}