mirror of https://github.com/apache/lucene.git
LUCENE-9930: Only load Ukrainian morfologik dictionary once per JVM (#109)
The UkrainianMorfologikAnalyzer was reloading its dictionary every time it created a new TokenStreamComponents, which meant that while the analyzer was open it would hold onto one copy of the dictionary per thread. This commit loads the dictionary in a lazy static initializer, alongside its stopword set. It also makes the normalizer charmap a singleton so that we do not rebuild the same immutable object on every call to initReader.
This commit is contained in:
parent
0c33e621f9
commit
90d363ece7
|
@ -243,6 +243,9 @@ Bug fixes
|
|||
* LUCENE-9580: Fix bug in the polygon tessellator when introducing collinear edges during polygon
|
||||
splitting. (Ignacio Vera)
|
||||
|
||||
* LUCENE-9930: The Ukrainian analyzer was reloading its dictionary for every new
|
||||
TokenStreamComponents, which could lead to memory leaks. (Alan Woodward)
|
||||
|
||||
Changes in Backwards Compatibility Policy
|
||||
|
||||
* LUCENE-9904: regenerated UAX29URLEmailTokenizer and the corresponding analyzer with up-to-date top
|
||||
|
|
|
@ -42,11 +42,31 @@ import org.apache.lucene.util.IOUtils;
|
|||
* @since 6.2.0
|
||||
*/
|
||||
public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
|
||||
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/** File containing default Ukrainian stopwords. */
|
||||
public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
||||
private static final NormalizeCharMap NORMALIZER_MAP;
|
||||
|
||||
// Builds the shared NORMALIZER_MAP once, when this class is initialized, so that initReader can
// reuse the same map instead of rebuilding it on every call.
static {
|
||||
NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
|
||||
// apostrophe look-alikes (U+2019, U+2018, U+02BC, grave accent, acute accent) are all mapped
// to the ASCII apostrophe
|
||||
builder.add("\u2019", "'");
|
||||
builder.add("\u2018", "'");
|
||||
builder.add("\u02BC", "'");
|
||||
builder.add("`", "'");
|
||||
builder.add("´", "'");
|
||||
// characters removed outright (mapped to the empty string): combining acute accent (U+0301)
// and soft hyphen (U+00AD)
|
||||
builder.add("\u0301", "");
|
||||
builder.add("\u00AD", "");
|
||||
// not a removal: ghe with upturn (ґ/Ґ) is replaced by plain ghe (г/Г)
builder.add("ґ", "г");
|
||||
builder.add("Ґ", "Г");
|
||||
|
||||
NORMALIZER_MAP = builder.build();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an unmodifiable instance of the default stop words set.
|
||||
*
|
||||
|
@ -57,11 +77,12 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
|
|||
}
|
||||
|
||||
/**
|
||||
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the
|
||||
* static final set the first time.;
|
||||
* Atomically loads the DEFAULT_STOP_SET and DICTIONARY in a lazy fashion once the outer class
|
||||
* accesses either of these static final fields for the first time.
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
static final Dictionary DICTIONARY;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -71,10 +92,15 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
|
|||
UkrainianMorfologikAnalyzer.class,
|
||||
DEFAULT_STOPWORD_FILE,
|
||||
StandardCharsets.UTF_8));
|
||||
DICTIONARY =
|
||||
Dictionary.read(
|
||||
UkrainianMorfologikAnalyzer.class
|
||||
.getClassLoader()
|
||||
.getResource("ua/net/nlp/ukrainian.dict"));
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new UncheckedIOException("Unable to load default stopword set", ex);
|
||||
throw new UncheckedIOException("Unable to load analyzer resources", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -107,22 +133,7 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
|
|||
|
||||
@Override
|
||||
protected Reader initReader(String fieldName, Reader reader) {
|
||||
NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
|
||||
// different apostrophes
|
||||
builder.add("\u2019", "'");
|
||||
builder.add("\u2018", "'");
|
||||
builder.add("\u02BC", "'");
|
||||
builder.add("`", "'");
|
||||
builder.add("´", "'");
|
||||
// ignored characters
|
||||
builder.add("\u0301", "");
|
||||
builder.add("\u00AD", "");
|
||||
builder.add("ґ", "г");
|
||||
builder.add("Ґ", "Г");
|
||||
|
||||
NormalizeCharMap normMap = builder.build();
|
||||
reader = new MappingCharFilter(normMap, reader);
|
||||
return reader;
|
||||
return new MappingCharFilter(NORMALIZER_MAP, reader);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -144,18 +155,7 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
|
||||
}
|
||||
|
||||
result = new MorfologikFilter(result, getDictionary());
|
||||
result = new MorfologikFilter(result, DefaultSetHolder.DICTIONARY);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
private static Dictionary getDictionary() {
|
||||
try {
|
||||
return Dictionary.read(
|
||||
UkrainianMorfologikAnalyzer.class
|
||||
.getClassLoader()
|
||||
.getResource("ua/net/nlp/ukrainian.dict"));
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue