From aee191d878835e9c0e7d20206bd5310f1cbb114e Mon Sep 17 00:00:00 2001 From: Dawid Weiss Date: Fri, 10 Dec 2021 14:02:43 +0100 Subject: [PATCH] LUCENE-10300: rewrite how resources are read in ukrainian morfologik analyzer (module vs. classpath lookup). --- .../tests/TestMorfologikAnalyzer.java | 2 - .../uk/UkrainianMorfologikAnalyzer.java | 102 +++++++++++------- 2 files changed, 63 insertions(+), 41 deletions(-) diff --git a/lucene/analysis/morfologik.tests/src/test/org/apache/lucene/analysis/morfologik/tests/TestMorfologikAnalyzer.java b/lucene/analysis/morfologik.tests/src/test/org/apache/lucene/analysis/morfologik/tests/TestMorfologikAnalyzer.java index 484462eeae3..c2933fc7f28 100644 --- a/lucene/analysis/morfologik.tests/src/test/org/apache/lucene/analysis/morfologik/tests/TestMorfologikAnalyzer.java +++ b/lucene/analysis/morfologik.tests/src/test/org/apache/lucene/analysis/morfologik/tests/TestMorfologikAnalyzer.java @@ -20,7 +20,6 @@ import org.apache.lucene.analysis.morfologik.MorfologikAnalyzer; import org.apache.lucene.analysis.uk.UkrainianMorfologikAnalyzer; import org.apache.lucene.index.IndexWriter; import org.junit.Assert; -import org.junit.Ignore; import org.junit.Test; public class TestMorfologikAnalyzer { @@ -31,7 +30,6 @@ public class TestMorfologikAnalyzer { } @Test - @Ignore("LUCENE-10300: Awaits fix - does not work in module mode.") public void testUkrainianMorfologikAnalyzerLoads() { var analyzer = new UkrainianMorfologikAnalyzer(); Assert.assertNotNull(analyzer); diff --git a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java index 425a55c6e63..b80ccb69f6c 100644 --- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java +++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.io.Reader; import java.io.UncheckedIOException; import java.nio.charset.StandardCharsets; +import java.util.Objects; import morfologik.stemming.Dictionary; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CharArraySet; @@ -42,12 +43,9 @@ import org.apache.lucene.util.IOUtils; * @since 6.2.0 */ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase { - + private final Dictionary dictionary; private final CharArraySet stemExclusionSet; - /** File containing default Ukrainian stopwords. */ - public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt"; - private static final NormalizeCharMap NORMALIZER_MAP; static { @@ -67,47 +65,72 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase { NORMALIZER_MAP = builder.build(); } - /** - * Returns an unmodifiable instance of the default stop words set. - * - * @return default stop words set. - */ - public static CharArraySet getDefaultStopSet() { - return DefaultSetHolder.DEFAULT_STOP_SET; + /** Returns a lazy singleton with the default Ukrainian resources. */ + private static volatile DefaultResources defaultResources; + + private static DefaultResources getDefaultResources() { + if (defaultResources == null) { + synchronized (DefaultResources.class) { + try { + CharArraySet wordList; + try (var is = UkrainianMorfologikAnalyzer.class.getResourceAsStream("stopwords.txt")) { + if (is == null) { + throw new IOException("Could not locate the required stopwords resource."); + } + wordList = + WordlistLoader.getSnowballWordSet( + IOUtils.getDecodingReader(is, StandardCharsets.UTF_8)); + } + + // First, try to look up the resource module by name. + Dictionary dictionary; + Module ourModule = DefaultResources.class.getModule(); + if (ourModule.isNamed() && ourModule.getLayer() != null) { + var module = + ourModule + .getLayer() + .findModule("morfologik.ukrainian.search") + .orElseThrow( + () -> + new IOException( + "Can't find the resource module: morfologik.ukrainian.search")); + + try (var fsaStream = module.getResourceAsStream("ua/net/nlp/ukrainian.dict"); + var metaStream = module.getResourceAsStream("ua/net/nlp/ukrainian.info")) { + dictionary = Dictionary.read(fsaStream, metaStream); + } + } else { + dictionary = + Dictionary.read( + Objects.requireNonNull( + UkrainianMorfologikAnalyzer.class + .getClassLoader() + .getResource("ua/net/nlp/ukrainian.dict"), + "Could not locate the required Ukrainian dictionary resource.")); + } + defaultResources = new DefaultResources(wordList, dictionary); + } catch (IOException e) { + throw new UncheckedIOException( + "Could not load the required resources for the Ukrainian analyzer.", e); + } + } + } + return defaultResources; } - /** - * Atomically loads the DEFAULT_STOP_SET and DICTIONARY in a lazy fashion once the outer class - * accesses the static final set the first time.; - */ - private static class DefaultSetHolder { - static final CharArraySet DEFAULT_STOP_SET; - static final Dictionary DICTIONARY; + private static class DefaultResources { + final CharArraySet stopSet; + final Dictionary dictionary; - static { - try { - DEFAULT_STOP_SET = - WordlistLoader.getSnowballWordSet( - IOUtils.getDecodingReader( - UkrainianMorfologikAnalyzer.class, - DEFAULT_STOPWORD_FILE, - StandardCharsets.UTF_8)); - DICTIONARY = - Dictionary.read( - UkrainianMorfologikAnalyzer.class - .getClassLoader() - .getResource("ua/net/nlp/ukrainian.dict")); - } catch (IOException ex) { - // default set should always be present as it is part of the - // distribution (JAR) - throw new UncheckedIOException("Unable to load analyzer resources", ex); - } + private DefaultResources(CharArraySet stopSet, Dictionary dictionary) { + this.stopSet = stopSet; + this.dictionary = dictionary; } } - /** Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. */ + /** Builds an analyzer with the default stop words. */ public UkrainianMorfologikAnalyzer() { - this(DefaultSetHolder.DEFAULT_STOP_SET); + this(getDefaultResources().stopSet); } /** @@ -129,6 +152,7 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase { public UkrainianMorfologikAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) { super(stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet)); + this.dictionary = getDefaultResources().dictionary; } @Override @@ -155,7 +179,7 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase { result = new SetKeywordMarkerFilter(result, stemExclusionSet); } - result = new MorfologikFilter(result, DefaultSetHolder.DICTIONARY); + result = new MorfologikFilter(result, dictionary); return new TokenStreamComponents(source, result); } }