LUCENE-10413: Make default Ukrainian stopword set available (#665)

This commit adds a new getDefaultStopwords() static method to
UkrainianMorfologikAnalyzer, which makes it possible to create an
analyzer with the default stop word set but a custom stem exclusion
set.
This commit is contained in:
Alan Woodward 2022-02-09 14:37:44 +00:00 committed by GitHub
parent 8178ffda00
commit 2183756f1c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 14 additions and 7 deletions

View File

@ -267,6 +267,8 @@ Other
and discover classes to check from module system. The test now checks all analyzer modules,
so it may discover new bugs outside of analysis:common module. (Uwe Schindler, Robert Muir)
* LUCENE-10413: Make Ukrainian default stop words list available as a public getter. (Alan Woodward)
======================= Lucene 9.0.0 =======================
New Features

View File

@ -113,14 +113,11 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
return defaultResources;
}
private static class DefaultResources {
final CharArraySet stopSet;
final Dictionary dictionary;
private record DefaultResources(CharArraySet stopSet, Dictionary dictionary) {}
private DefaultResources(CharArraySet stopSet, Dictionary dictionary) {
this.stopSet = stopSet;
this.dictionary = dictionary;
}
/**
 * Exposes the stop word set held by this analyzer's default resources.
 *
 * @return the default stop words, wrapped by {@link CharArraySet#unmodifiableSet(CharArraySet)}
 */
public static CharArraySet getDefaultStopwords() {
  final CharArraySet defaults = getDefaultResources().stopSet;
  return CharArraySet.unmodifiableSet(defaults);
}
/** Builds an analyzer with the default stop words. */

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis.uk;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
/** Test case for UkrainianAnalyzer. */
@ -99,4 +100,11 @@ public class TestUkrainianAnalyzer extends BaseTokenStreamTestCase {
checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
analyzer.close();
}
/**
 * Checks that the default stop word set is exposed through the static getter and that it cannot
 * be altered through the returned reference.
 */
public void testDefaultStopWords() {
  CharArraySet stopwords = UkrainianMorfologikAnalyzer.getDefaultStopwords();
  assertTrue(stopwords.contains("аби"));

  // The getter wraps the set with CharArraySet.unmodifiableSet, so removal must be rejected.
  // The exception is caught here so that the rejection is asserted instead of aborting the test.
  boolean rejected = false;
  try {
    stopwords.remove("аби");
  } catch (UnsupportedOperationException expected) {
    rejected = true;
  }
  assertTrue(rejected);

  // A fresh call must still report the word: the shared default set was not modified.
  assertTrue(UkrainianMorfologikAnalyzer.getDefaultStopwords().contains("аби"));
}
}