LUCENE-10413: Make default Ukrainian stopword set available (#665)

This commit adds a new getDefaultStopwords() static method to
UkrainianMorfologikAnalyzer, which makes it possible to create an
analyzer with the default stop word set but a custom stem exclusion
set.
This commit is contained in:
Alan Woodward 2022-02-09 14:37:44 +00:00 committed by GitHub
parent 8178ffda00
commit 2183756f1c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 14 additions and 7 deletions

View File

@ -267,6 +267,8 @@ Other
and discover classes to check from module system. The test now checks all analyzer modules, and discover classes to check from module system. The test now checks all analyzer modules,
so it may discover new bugs outside of analysis:common module. (Uwe Schindler, Robert Muir) so it may discover new bugs outside of analysis:common module. (Uwe Schindler, Robert Muir)
* LUCENE-10413: Make the default Ukrainian stop word set available via a public static getter, UkrainianMorfologikAnalyzer.getDefaultStopwords(). (Alan Woodward)
======================= Lucene 9.0.0 ======================= ======================= Lucene 9.0.0 =======================
New Features New Features

View File

@ -113,14 +113,11 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
return defaultResources; return defaultResources;
} }
private static class DefaultResources { private record DefaultResources(CharArraySet stopSet, Dictionary dictionary) {}
final CharArraySet stopSet;
final Dictionary dictionary;
private DefaultResources(CharArraySet stopSet, Dictionary dictionary) { /** Returns the default stopword set for this analyzer */
this.stopSet = stopSet; public static CharArraySet getDefaultStopwords() {
this.dictionary = dictionary; return CharArraySet.unmodifiableSet(getDefaultResources().stopSet);
}
} }
/** Builds an analyzer with the default stop words. */ /** Builds an analyzer with the default stop words. */

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis.uk;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
/** Test case for UkrainianAnalyzer. */ /** Test case for UkrainianAnalyzer. */
@ -99,4 +100,11 @@ public class TestUkrainianAnalyzer extends BaseTokenStreamTestCase {
checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER); checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
analyzer.close(); analyzer.close();
} }
/**
 * Checks that the default stopword set is publicly reachable through
 * UkrainianMorfologikAnalyzer.getDefaultStopwords(), and that attempting to remove an entry from
 * the returned set does not affect what later calls to the getter return.
 */
public void testDefaultStopWords() {
CharArraySet stopwords = UkrainianMorfologikAnalyzer.getDefaultStopwords();
// "аби" is used here as a word expected to be in the default stopword set.
assertTrue(stopwords.contains("аби"));
// NOTE(review): the getter wraps the set with CharArraySet.unmodifiableSet, so this remove()
// presumably is either rejected or has no effect -- confirm which. The test has no exception
// handling around it, so it relies on the call not throwing.
stopwords.remove("аби");
// A fresh call to the getter must still report the word after the attempted removal.
assertTrue(UkrainianMorfologikAnalyzer.getDefaultStopwords().contains("аби"));
}
} }