mirror of https://github.com/apache/lucene.git
LUCENE-10300: Rewrite how resources are read in the Ukrainian Morfologik analyzer (module vs. classpath lookup).
This commit is contained in:
parent
768adb99d6
commit
aee191d878
|
@ -20,7 +20,6 @@ import org.apache.lucene.analysis.morfologik.MorfologikAnalyzer;
|
||||||
import org.apache.lucene.analysis.uk.UkrainianMorfologikAnalyzer;
|
import org.apache.lucene.analysis.uk.UkrainianMorfologikAnalyzer;
|
||||||
import org.apache.lucene.index.IndexWriter;
|
import org.apache.lucene.index.IndexWriter;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
import org.junit.Ignore;
|
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
public class TestMorfologikAnalyzer {
|
public class TestMorfologikAnalyzer {
|
||||||
|
@ -31,7 +30,6 @@ public class TestMorfologikAnalyzer {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Ignore("LUCENE-10300: Awaits fix - does not work in module mode.")
|
|
||||||
public void testUkrainianMorfologikAnalyzerLoads() {
|
public void testUkrainianMorfologikAnalyzerLoads() {
|
||||||
var analyzer = new UkrainianMorfologikAnalyzer();
|
var analyzer = new UkrainianMorfologikAnalyzer();
|
||||||
Assert.assertNotNull(analyzer);
|
Assert.assertNotNull(analyzer);
|
||||||
|
|
|
@ -20,6 +20,7 @@ import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.io.UncheckedIOException;
|
import java.io.UncheckedIOException;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.Objects;
|
||||||
import morfologik.stemming.Dictionary;
|
import morfologik.stemming.Dictionary;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.CharArraySet;
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
@ -42,12 +43,9 @@ import org.apache.lucene.util.IOUtils;
|
||||||
* @since 6.2.0
|
* @since 6.2.0
|
||||||
*/
|
*/
|
||||||
public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
|
public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
private final Dictionary dictionary;
|
||||||
private final CharArraySet stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/** File containing default Ukrainian stopwords. */
|
|
||||||
public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
|
||||||
|
|
||||||
private static final NormalizeCharMap NORMALIZER_MAP;
|
private static final NormalizeCharMap NORMALIZER_MAP;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
|
@ -67,47 +65,72 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
|
||||||
NORMALIZER_MAP = builder.build();
|
NORMALIZER_MAP = builder.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** Lazily initialized singleton holding the default Ukrainian resources (stop set and dictionary). */
|
||||||
* Returns an unmodifiable instance of the default stop words set.
|
private static volatile DefaultResources defaultResources;
|
||||||
*
|
|
||||||
* @return default stop words set.
|
private static DefaultResources getDefaultResources() {
|
||||||
*/
|
if (defaultResources == null) {
|
||||||
public static CharArraySet getDefaultStopSet() {
|
synchronized (DefaultResources.class) {
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
try {
|
||||||
|
CharArraySet wordList;
|
||||||
|
try (var is = UkrainianMorfologikAnalyzer.class.getResourceAsStream("stopwords.txt")) {
|
||||||
|
if (is == null) {
|
||||||
|
throw new IOException("Could not locate the required stopwords resource.");
|
||||||
|
}
|
||||||
|
wordList =
|
||||||
|
WordlistLoader.getSnowballWordSet(
|
||||||
|
IOUtils.getDecodingReader(is, StandardCharsets.UTF_8));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
// First, try to look up the resource module by name.
|
||||||
* Atomically loads the DEFAULT_STOP_SET and DICTIONARY in a lazy fashion once the outer class
|
Dictionary dictionary;
|
||||||
* accesses the static final set the first time.
|
Module ourModule = DefaultResources.class.getModule();
|
||||||
*/
|
if (ourModule.isNamed() && ourModule.getLayer() != null) {
|
||||||
private static class DefaultSetHolder {
|
var module =
|
||||||
static final CharArraySet DEFAULT_STOP_SET;
|
ourModule
|
||||||
static final Dictionary DICTIONARY;
|
.getLayer()
|
||||||
|
.findModule("morfologik.ukrainian.search")
|
||||||
|
.orElseThrow(
|
||||||
|
() ->
|
||||||
|
new IOException(
|
||||||
|
"Can't find the resource module: morfologik.ukrainian.search"));
|
||||||
|
|
||||||
static {
|
try (var fsaStream = module.getResourceAsStream("ua/net/nlp/ukrainian.dict");
|
||||||
try {
|
var metaStream = module.getResourceAsStream("ua/net/nlp/ukrainian.info")) {
|
||||||
DEFAULT_STOP_SET =
|
dictionary = Dictionary.read(fsaStream, metaStream);
|
||||||
WordlistLoader.getSnowballWordSet(
|
}
|
||||||
IOUtils.getDecodingReader(
|
} else {
|
||||||
UkrainianMorfologikAnalyzer.class,
|
dictionary =
|
||||||
DEFAULT_STOPWORD_FILE,
|
|
||||||
StandardCharsets.UTF_8));
|
|
||||||
DICTIONARY =
|
|
||||||
Dictionary.read(
|
Dictionary.read(
|
||||||
|
Objects.requireNonNull(
|
||||||
UkrainianMorfologikAnalyzer.class
|
UkrainianMorfologikAnalyzer.class
|
||||||
.getClassLoader()
|
.getClassLoader()
|
||||||
.getResource("ua/net/nlp/ukrainian.dict"));
|
.getResource("ua/net/nlp/ukrainian.dict"),
|
||||||
} catch (IOException ex) {
|
"Could not locate the required Ukrainian dictionary resource."));
|
||||||
// default set should always be present as it is part of the
|
|
||||||
// distribution (JAR)
|
|
||||||
throw new UncheckedIOException("Unable to load analyzer resources", ex);
|
|
||||||
}
|
}
|
||||||
|
defaultResources = new DefaultResources(wordList, dictionary);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new UncheckedIOException(
|
||||||
|
"Could not load the required resources for the Ukrainian analyzer.", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return defaultResources;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class DefaultResources {
|
||||||
|
final CharArraySet stopSet;
|
||||||
|
final Dictionary dictionary;
|
||||||
|
|
||||||
|
private DefaultResources(CharArraySet stopSet, Dictionary dictionary) {
|
||||||
|
this.stopSet = stopSet;
|
||||||
|
this.dictionary = dictionary;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. */
|
/** Builds an analyzer with the default stop words. */
|
||||||
public UkrainianMorfologikAnalyzer() {
|
public UkrainianMorfologikAnalyzer() {
|
||||||
this(DefaultSetHolder.DEFAULT_STOP_SET);
|
this(getDefaultResources().stopSet);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -129,6 +152,7 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
|
||||||
public UkrainianMorfologikAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
public UkrainianMorfologikAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
super(stopwords);
|
super(stopwords);
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
|
||||||
|
this.dictionary = getDefaultResources().dictionary;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -155,7 +179,7 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
|
||||||
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
|
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
|
||||||
}
|
}
|
||||||
|
|
||||||
result = new MorfologikFilter(result, DefaultSetHolder.DICTIONARY);
|
result = new MorfologikFilter(result, dictionary);
|
||||||
return new TokenStreamComponents(source, result);
|
return new TokenStreamComponents(source, result);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue