LUCENE-10300: rewrite how resources are read in the Ukrainian Morfologik analyzer (module vs. classpath lookup).

This commit is contained in:
Dawid Weiss 2021-12-10 14:02:43 +01:00
parent 768adb99d6
commit aee191d878
2 changed files with 63 additions and 41 deletions

View File

@ -20,7 +20,6 @@ import org.apache.lucene.analysis.morfologik.MorfologikAnalyzer;
import org.apache.lucene.analysis.uk.UkrainianMorfologikAnalyzer; import org.apache.lucene.analysis.uk.UkrainianMorfologikAnalyzer;
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test; import org.junit.Test;
public class TestMorfologikAnalyzer { public class TestMorfologikAnalyzer {
@ -31,7 +30,6 @@ public class TestMorfologikAnalyzer {
} }
@Test @Test
@Ignore("LUCENE-10300: Awaits fix - does not work in module mode.")
public void testUkrainianMorfologikAnalyzerLoads() { public void testUkrainianMorfologikAnalyzerLoads() {
var analyzer = new UkrainianMorfologikAnalyzer(); var analyzer = new UkrainianMorfologikAnalyzer();
Assert.assertNotNull(analyzer); Assert.assertNotNull(analyzer);

View File

@ -20,6 +20,7 @@ import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.io.UncheckedIOException; import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.Objects;
import morfologik.stemming.Dictionary; import morfologik.stemming.Dictionary;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.CharArraySet;
@ -42,12 +43,9 @@ import org.apache.lucene.util.IOUtils;
* @since 6.2.0 * @since 6.2.0
*/ */
public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase { public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
private final Dictionary dictionary;
private final CharArraySet stemExclusionSet; private final CharArraySet stemExclusionSet;
/** File containing default Ukrainian stopwords. */
public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
private static final NormalizeCharMap NORMALIZER_MAP; private static final NormalizeCharMap NORMALIZER_MAP;
static { static {
@ -67,47 +65,72 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
NORMALIZER_MAP = builder.build(); NORMALIZER_MAP = builder.build();
} }
/** /** Returns a lazy singleton with the default Ukrainian resources. */
* Returns an unmodifiable instance of the default stop words set. private static volatile DefaultResources defaultResources;
*
* @return default stop words set. private static DefaultResources getDefaultResources() {
*/ if (defaultResources == null) {
public static CharArraySet getDefaultStopSet() { synchronized (DefaultResources.class) {
return DefaultSetHolder.DEFAULT_STOP_SET; try {
CharArraySet wordList;
try (var is = UkrainianMorfologikAnalyzer.class.getResourceAsStream("stopwords.txt")) {
if (is == null) {
throw new IOException("Could not locate the required stopwords resource.");
}
wordList =
WordlistLoader.getSnowballWordSet(
IOUtils.getDecodingReader(is, StandardCharsets.UTF_8));
} }
/** // First, try to look up the resource module by name.
* Atomically loads the DEFAULT_STOP_SET and DICTIONARY in a lazy fashion once the outer class Dictionary dictionary;
* accesses the static final set the first time.; Module ourModule = DefaultResources.class.getModule();
*/ if (ourModule.isNamed() && ourModule.getLayer() != null) {
private static class DefaultSetHolder { var module =
static final CharArraySet DEFAULT_STOP_SET; ourModule
static final Dictionary DICTIONARY; .getLayer()
.findModule("morfologik.ukrainian.search")
.orElseThrow(
() ->
new IOException(
"Can't find the resource module: morfologik.ukrainian.search"));
static { try (var fsaStream = module.getResourceAsStream("ua/net/nlp/ukrainian.dict");
try { var metaStream = module.getResourceAsStream("ua/net/nlp/ukrainian.info")) {
DEFAULT_STOP_SET = dictionary = Dictionary.read(fsaStream, metaStream);
WordlistLoader.getSnowballWordSet( }
IOUtils.getDecodingReader( } else {
UkrainianMorfologikAnalyzer.class, dictionary =
DEFAULT_STOPWORD_FILE,
StandardCharsets.UTF_8));
DICTIONARY =
Dictionary.read( Dictionary.read(
Objects.requireNonNull(
UkrainianMorfologikAnalyzer.class UkrainianMorfologikAnalyzer.class
.getClassLoader() .getClassLoader()
.getResource("ua/net/nlp/ukrainian.dict")); .getResource("ua/net/nlp/ukrainian.dict"),
} catch (IOException ex) { "Could not locate the required Ukrainian dictionary resource."));
// default set should always be present as it is part of the
// distribution (JAR)
throw new UncheckedIOException("Unable to load analyzer resources", ex);
} }
defaultResources = new DefaultResources(wordList, dictionary);
} catch (IOException e) {
throw new UncheckedIOException(
"Could not load the required resources for the Ukrainian analyzer.", e);
}
}
}
return defaultResources;
}
private static class DefaultResources {
final CharArraySet stopSet;
final Dictionary dictionary;
private DefaultResources(CharArraySet stopSet, Dictionary dictionary) {
this.stopSet = stopSet;
this.dictionary = dictionary;
} }
} }
/** Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. */ /** Builds an analyzer with the default stop words. */
public UkrainianMorfologikAnalyzer() { public UkrainianMorfologikAnalyzer() {
this(DefaultSetHolder.DEFAULT_STOP_SET); this(getDefaultResources().stopSet);
} }
/** /**
@ -129,6 +152,7 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
public UkrainianMorfologikAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) { public UkrainianMorfologikAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(stopwords); super(stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet)); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
this.dictionary = getDefaultResources().dictionary;
} }
@Override @Override
@ -155,7 +179,7 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
result = new SetKeywordMarkerFilter(result, stemExclusionSet); result = new SetKeywordMarkerFilter(result, stemExclusionSet);
} }
result = new MorfologikFilter(result, DefaultSetHolder.DICTIONARY); result = new MorfologikFilter(result, dictionary);
return new TokenStreamComponents(source, result); return new TokenStreamComponents(source, result);
} }
} }