LUCENE-10413: Make default Ukrainian stopword set available (#665)

This commit adds a new getDefaultStopwords() static method to UkrainianMorfologikAnalyzer, which makes it possible to create an analyzer with the default stop word set but a custom stem exclusion set.
2022-02-09 14:37:44 +00:00 · 2022-02-09 14:37:44 +00:00 · 2183756f1c
parent 8178ffda00
commit 2183756f1c
3 changed files with 14 additions and 7 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -267,6 +267,8 @@ Other
  and discover classes to check from module system. The test now checks all analyzer modules,
  so it may discover new bugs outside of analysis:common module.  (Uwe Schindler, Robert Muir)
 * LUCENE-10413: Make Ukrainian default stop words list available as a public getter. (Alan Woodward)
 ======================= Lucene 9.0.0 =======================
 New Features
--- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java
+++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java
@ -113,14 +113,11 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
    return defaultResources;
  }
-  private static class DefaultResources {
+  private record DefaultResources(CharArraySet stopSet, Dictionary dictionary) {}
    final CharArraySet stopSet;
    final Dictionary dictionary;
-    private DefaultResources(CharArraySet stopSet, Dictionary dictionary) {
+  /** Returns the default stopword set for this analyzer */
-      this.stopSet = stopSet;
+  public static CharArraySet getDefaultStopwords() {
-      this.dictionary = dictionary;
+    return CharArraySet.unmodifiableSet(getDefaultResources().stopSet);
    }
  }
  /** Builds an analyzer with the default stop words. */
--- a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java
+++ b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java
@ -18,6 +18,7 @@ package org.apache.lucene.analysis.uk;
 import java.io.IOException;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
 /** Test case for UkrainianAnalyzer. */
@ -99,4 +100,11 @@ public class TestUkrainianAnalyzer extends BaseTokenStreamTestCase {
    checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
    analyzer.close();
  }
  /** Modifying the set returned by getDefaultStopwords() must not affect the shared defaults. */
  public void testDefaultStopWords() {
    final String stopword = "аби";
    CharArraySet defaults = UkrainianMorfologikAnalyzer.getDefaultStopwords();
    assertTrue(defaults.contains(stopword));
    // Attempt a removal through the returned set, then fetch the defaults again:
    // the word must still be present.
    defaults.remove(stopword);
    CharArraySet refetched = UkrainianMorfologikAnalyzer.getDefaultStopwords();
    assertTrue(refetched.contains(stopword));
  }
 }