mirror of https://github.com/apache/lucene.git
LUCENE-9930: Only load Ukrainian morfologik dictionary once per JVM (#109)
The UkrainianMorfologikAnalyzer was reloading its dictionary every time it created a new TokenStreamComponents, which meant that while the analyzer was open it would hold onto one copy of the dictionary per thread. This commit loads the dictionary in a lazy static initializer, alongside its stopword set. It also makes the normalizer charmap a singleton so that we do not rebuild the same immutable object on every call to initReader.
This commit is contained in:
parent
0c33e621f9
commit
90d363ece7
|
@ -243,6 +243,9 @@ Bug fixes
|
||||||
* LUCENE-9580: Fix bug in the polygon tessellator when introducing collinear edges during polygon
|
* LUCENE-9580: Fix bug in the polygon tessellator when introducing collinear edges during polygon
|
||||||
splitting. (Ignacio Vera)
|
splitting. (Ignacio Vera)
|
||||||
|
|
||||||
|
* LUCENE-9930: The Ukrainian analyzer was reloading its dictionary for every new
|
||||||
|
TokenStreamComponents, which could lead to memory leaks. (Alan Woodward)
|
||||||
|
|
||||||
Changes in Backwards Compatibility Policy
|
Changes in Backwards Compatibility Policy
|
||||||
|
|
||||||
* LUCENE-9904: regenerated UAX29URLEmailTokenizer and the corresponding analyzer with up-to-date top
|
* LUCENE-9904: regenerated UAX29URLEmailTokenizer and the corresponding analyzer with up-to-date top
|
||||||
|
|
|
@ -42,11 +42,31 @@ import org.apache.lucene.util.IOUtils;
|
||||||
* @since 6.2.0
|
* @since 6.2.0
|
||||||
*/
|
*/
|
||||||
public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
|
public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
|
||||||
private final CharArraySet stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/** File containing default Ukrainian stopwords. */
|
/** File containing default Ukrainian stopwords. */
|
||||||
public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||||
|
|
||||||
|
private static final NormalizeCharMap NORMALIZER_MAP;
|
||||||
|
|
||||||
|
static {
|
||||||
|
NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
|
||||||
|
// different apostrophes
|
||||||
|
builder.add("\u2019", "'");
|
||||||
|
builder.add("\u2018", "'");
|
||||||
|
builder.add("\u02BC", "'");
|
||||||
|
builder.add("`", "'");
|
||||||
|
builder.add("´", "'");
|
||||||
|
// ignored characters
|
||||||
|
builder.add("\u0301", "");
|
||||||
|
builder.add("\u00AD", "");
|
||||||
|
builder.add("ґ", "г");
|
||||||
|
builder.add("Ґ", "Г");
|
||||||
|
|
||||||
|
NORMALIZER_MAP = builder.build();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns an unmodifiable instance of the default stop words set.
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
*
|
*
|
||||||
|
@ -57,11 +77,12 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the
|
* Atomically loads the DEFAULT_STOP_SET and DICTIONARY in a lazy fashion once the outer class
|
||||||
* static final set the first time.
|
* accesses the static final set the first time.
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final CharArraySet DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
static final Dictionary DICTIONARY;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -71,10 +92,15 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
|
||||||
UkrainianMorfologikAnalyzer.class,
|
UkrainianMorfologikAnalyzer.class,
|
||||||
DEFAULT_STOPWORD_FILE,
|
DEFAULT_STOPWORD_FILE,
|
||||||
StandardCharsets.UTF_8));
|
StandardCharsets.UTF_8));
|
||||||
|
DICTIONARY =
|
||||||
|
Dictionary.read(
|
||||||
|
UkrainianMorfologikAnalyzer.class
|
||||||
|
.getClassLoader()
|
||||||
|
.getResource("ua/net/nlp/ukrainian.dict"));
|
||||||
} catch (IOException ex) {
|
} catch (IOException ex) {
|
||||||
// default set should always be present as it is part of the
|
// default set should always be present as it is part of the
|
||||||
// distribution (JAR)
|
// distribution (JAR)
|
||||||
throw new UncheckedIOException("Unable to load default stopword set", ex);
|
throw new UncheckedIOException("Unable to load analyzer resources", ex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -107,22 +133,7 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Reader initReader(String fieldName, Reader reader) {
|
protected Reader initReader(String fieldName, Reader reader) {
|
||||||
NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
|
return new MappingCharFilter(NORMALIZER_MAP, reader);
|
||||||
// different apostrophes
|
|
||||||
builder.add("\u2019", "'");
|
|
||||||
builder.add("\u2018", "'");
|
|
||||||
builder.add("\u02BC", "'");
|
|
||||||
builder.add("`", "'");
|
|
||||||
builder.add("´", "'");
|
|
||||||
// ignored characters
|
|
||||||
builder.add("\u0301", "");
|
|
||||||
builder.add("\u00AD", "");
|
|
||||||
builder.add("ґ", "г");
|
|
||||||
builder.add("Ґ", "Г");
|
|
||||||
|
|
||||||
NormalizeCharMap normMap = builder.build();
|
|
||||||
reader = new MappingCharFilter(normMap, reader);
|
|
||||||
return reader;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -144,18 +155,7 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
|
||||||
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
|
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
|
||||||
}
|
}
|
||||||
|
|
||||||
result = new MorfologikFilter(result, getDictionary());
|
result = new MorfologikFilter(result, DefaultSetHolder.DICTIONARY);
|
||||||
return new TokenStreamComponents(source, result);
|
return new TokenStreamComponents(source, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Dictionary getDictionary() {
|
|
||||||
try {
|
|
||||||
return Dictionary.read(
|
|
||||||
UkrainianMorfologikAnalyzer.class
|
|
||||||
.getClassLoader()
|
|
||||||
.getResource("ua/net/nlp/ukrainian.dict"));
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue