diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md index e66470834ac..3c70852bc7e 100644 --- a/lucene/MIGRATE.md +++ b/lucene/MIGRATE.md @@ -1,5 +1,11 @@ # Apache Lucene Migration Guide +## JapanesePartOfSpeechStopFilterFactory loads default stop tags if "tags" argument not specified (LUCENE-9567) + +Previously, JapanesePartOfSpeechStopFilterFactory added no filter if `args` didn't include "tags". Now, it will load +the default stop tags returned by `JapaneseAnalyzer.getDefaultStopTags()` (i.e. the tags from `stoptags.txt` in the +`lucene-analyzers-kuromoji` jar). + ## ICUCollationKeyAnalyzer is renamed (LUCENE-9558) o.a.l.collation.ICUCollationAnalyzer is renamed to o.a.l.a.icu.ICUCollationKeyAnalyzer. diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java index 9ba85c7e530..0e69d0180e1 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java @@ -53,6 +53,9 @@ public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory im public JapanesePartOfSpeechStopFilterFactory(Map args) { super(args); stopTagFiles = get(args, "tags"); + if (stopTagFiles == null) { + stopTags = JapaneseAnalyzer.getDefaultStopTags(); + } if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } @@ -65,13 +68,15 @@ public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory im @Override public void inform(ResourceLoader loader) throws IOException { - stopTags = null; - CharArraySet cas = getWordSet(loader, stopTagFiles, false); - if (cas != null) { - stopTags = new HashSet<>(); - for (Object element : cas) { - char chars[] = (char[]) element; - stopTags.add(new String(chars)); + if 
(stopTagFiles != null) { + stopTags = null; + CharArraySet cas = getWordSet(loader, stopTagFiles, false); + if (cas != null) { + stopTags = new HashSet<>(); + for (Object element : cas) { + char chars[] = (char[]) element; + stopTags.add(new String(chars)); + } } } } diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java index 501d2fca408..6c728a05ef4 100644 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java @@ -25,6 +25,7 @@ import java.util.Map; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.util.ClasspathResourceLoader; import org.apache.lucene.util.Version; /** @@ -50,6 +51,22 @@ public class TestJapanesePartOfSpeechStopFilterFactory extends BaseTokenStreamTe new String[] { "私", "は", "制限", "スピード", "を" } ); } + + /** If we don't specify "tags", then load the default stop tags. 
*/ + public void testNoTagsSpecified() throws IOException { + JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap()); + tokenizerFactory.inform(new StringMockResourceLoader("")); + TokenStream ts = tokenizerFactory.create(); + ((Tokenizer)ts).setReader(new StringReader("私は制限スピードを超える。")); + Map args = new HashMap<>(); + args.put("luceneMatchVersion", Version.LATEST.toString()); + JapanesePartOfSpeechStopFilterFactory factory = new JapanesePartOfSpeechStopFilterFactory(args); + factory.inform(new ClasspathResourceLoader(JapaneseAnalyzer.class)); + ts = factory.create(ts); + assertTokenStreamContents(ts, + new String[] { "私", "制限", "スピード", "超える" } + ); + } /** Test that bogus arguments result in exception */ public void testBogusArguments() throws Exception {