From 2183756f1c8253002bb697bdb8e026e86c4b3db5 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Wed, 9 Feb 2022 14:37:44 +0000 Subject: [PATCH] LUCENE-10413: Make default Ukrainian stopword set available (#665) This commit adds a new getDefaultStopwords() static method to UkrainianMorfologikAnalyzer, which makes it possible to create an analyzer with the default stop word set but a custom stem exclusion set. --- lucene/CHANGES.txt | 2 ++ .../analysis/uk/UkrainianMorfologikAnalyzer.java | 11 ++++------- .../lucene/analysis/uk/TestUkrainianAnalyzer.java | 8 ++++++++ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 6f380a4965b..73909964b22 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -267,6 +267,8 @@ Other and discover classes to check from module system. The test now checks all analyzer modules, so it may discover new bugs outside of analysis:common module. (Uwe Schindler, Robert Muir) +* LUCENE-10413: Make Ukrainian default stop words list available as a public getter. (Alan Woodward) + ======================= Lucene 9.0.0 ======================= New Features diff --git a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java index eb29448ca7d..558239a95d9 100644 --- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java +++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java @@ -113,14 +113,11 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase { return defaultResources; } - private static class DefaultResources { - final CharArraySet stopSet; - final Dictionary dictionary; + private record DefaultResources(CharArraySet stopSet, Dictionary dictionary) {} - private DefaultResources(CharArraySet stopSet, Dictionary dictionary) { - this.stopSet = stopSet; - this.dictionary = dictionary; - } + /** Returns the default stopword set for this analyzer */ + public static CharArraySet getDefaultStopwords() { + return CharArraySet.unmodifiableSet(getDefaultResources().stopSet); } /** Builds an analyzer with the default stop words. */ diff --git a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java index e04f62639db..f984ff8c18c 100644 --- a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java +++ b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java @@ -18,6 +18,7 @@ package org.apache.lucene.analysis.uk; import java.io.IOException; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; /** Test case for UkrainianAnalyzer. */ @@ -99,4 +100,11 @@ public class TestUkrainianAnalyzer extends BaseTokenStreamTestCase { checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER); analyzer.close(); } + + public void testDefaultStopWords() { + CharArraySet stopwords = UkrainianMorfologikAnalyzer.getDefaultStopwords(); + assertTrue(stopwords.contains("аби")); + stopwords.remove("аби"); + assertTrue(UkrainianMorfologikAnalyzer.getDefaultStopwords().contains("аби")); + } }