LUCENE-9567: JPOSSFF loads built-in stop tags by default (#1961)

load stoptags.txt from analysis-kuromoji when no tags argument is specified
2020-10-09 07:52:07 -07:00 · 2020-10-09 07:52:07 -07:00 · 4e0aa0d23b
parent 47a3f591a8
commit 4e0aa0d23b
3 changed files with 35 additions and 7 deletions
--- a/lucene/MIGRATE.md
+++ b/lucene/MIGRATE.md
@ -1,5 +1,11 @@
 # Apache Lucene Migration Guide

+## JapanesePartOfSpeechStopFilterFactory loads default stop tags if "tags" argument not specified (LUCENE-9567)
+
+Previously, JapanesePartOfSpeechStopFilterFactory added no filter if `args` didn't include "tags". Now, it will load 
+the default stop tags returned by `JapaneseAnalyzer.getDefaultStopTags()` (i.e. the tags from `stoptags.txt` in the 
+`lucene-analyzers-kuromoji` jar).
+
 ## ICUCollationKeyAnalyzer is renamed (LUCENE-9558)

 o.a.l.collation.ICUCollationAnalyzer is renamed to o.a.l.a.icu.ICUCollationKeyAnalyzer.
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java
@ -53,6 +53,9 @@ public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory im
  public JapanesePartOfSpeechStopFilterFactory(Map<String,String> args) {
    super(args);
    stopTagFiles = get(args, "tags");
+    if (stopTagFiles == null) {
+      stopTags = JapaneseAnalyzer.getDefaultStopTags();
+    }
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
@ -65,13 +68,15 @@ public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory im

  @Override
  public void inform(ResourceLoader loader) throws IOException {
-    stopTags = null;
-    CharArraySet cas = getWordSet(loader, stopTagFiles, false);
-    if (cas != null) {
-      stopTags = new HashSet<>();
-      for (Object element : cas) {
-        char chars[] = (char[]) element;
-        stopTags.add(new String(chars));
+    if (stopTagFiles != null) {
+      stopTags = null;
+      CharArraySet cas = getWordSet(loader, stopTagFiles, false);
+      if (cas != null) {
+        stopTags = new HashSet<>();
+        for (Object element : cas) {
+          char chars[] = (char[]) element;
+          stopTags.add(new String(chars));
+        }
      }
    }
  }
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java
@ -25,6 +25,7 @@ import java.util.Map;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util.ClasspathResourceLoader;
 import org.apache.lucene.util.Version;

 /**
@ -50,6 +51,22 @@ public class TestJapanesePartOfSpeechStopFilterFactory extends BaseTokenStreamTe
        new String[] { "私", "は", "制限", "スピード", "を" }
    );
  }
+
+  /** If we don't specify "tags", then load the default stop tags. */
+  public void testNoTagsSpecified() throws IOException {
+    JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<String,String>());
+    tokenizerFactory.inform(new StringMockResourceLoader(""));
+    TokenStream ts = tokenizerFactory.create();
+    ((Tokenizer)ts).setReader(new StringReader("私は制限スピードを超える。"));
+    Map<String,String> args = new HashMap<>();
+    args.put("luceneMatchVersion", Version.LATEST.toString());
+    JapanesePartOfSpeechStopFilterFactory factory = new JapanesePartOfSpeechStopFilterFactory(args);
+    factory.inform(new ClasspathResourceLoader(JapaneseAnalyzer.class));
+    ts = factory.create(ts);
+    assertTokenStreamContents(ts,
+            new String[] { "私", "制限", "スピード", "超える" }
+    );
+  }
  
  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {