LUCENE-9567: JapanesePartOfSpeechStopFilterFactory (JPOSSFF) loads built-in stop tags by default (#1961)

load stoptags.txt from analysis-kuromoji when no tags argument is specified
This commit is contained in:
msfroh 2020-10-09 07:52:07 -07:00 committed by GitHub
parent 47a3f591a8
commit 4e0aa0d23b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 35 additions and 7 deletions

View File

@ -1,5 +1,11 @@
# Apache Lucene Migration Guide # Apache Lucene Migration Guide
## JapanesePartOfSpeechStopFilterFactory loads default stop tags if "tags" argument not specified (LUCENE-9567)
Previously, JapanesePartOfSpeechStopFilterFactory added no filter if `args` didn't include "tags". Now, it will load
the default stop tags returned by `JapaneseAnalyzer.getDefaultStopTags()` (i.e. the tags from `stoptags.txt` in the
`lucene-analyzers-kuromoji` jar).
## ICUCollationKeyAnalyzer is renamed (LUCENE-9558) ## ICUCollationKeyAnalyzer is renamed (LUCENE-9558)
o.a.l.collation.ICUCollationAnalyzer is renamed to o.a.l.a.icu.ICUCollationKeyAnalyzer. o.a.l.collation.ICUCollationAnalyzer is renamed to o.a.l.a.icu.ICUCollationKeyAnalyzer.

View File

@ -53,6 +53,9 @@ public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory im
public JapanesePartOfSpeechStopFilterFactory(Map<String,String> args) { public JapanesePartOfSpeechStopFilterFactory(Map<String,String> args) {
super(args); super(args);
stopTagFiles = get(args, "tags"); stopTagFiles = get(args, "tags");
if (stopTagFiles == null) {
stopTags = JapaneseAnalyzer.getDefaultStopTags();
}
if (!args.isEmpty()) { if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args); throw new IllegalArgumentException("Unknown parameters: " + args);
} }
@ -65,6 +68,7 @@ public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory im
@Override @Override
public void inform(ResourceLoader loader) throws IOException { public void inform(ResourceLoader loader) throws IOException {
if (stopTagFiles != null) {
stopTags = null; stopTags = null;
CharArraySet cas = getWordSet(loader, stopTagFiles, false); CharArraySet cas = getWordSet(loader, stopTagFiles, false);
if (cas != null) { if (cas != null) {
@ -75,6 +79,7 @@ public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory im
} }
} }
} }
}
@Override @Override
public TokenStream create(TokenStream stream) { public TokenStream create(TokenStream stream) {

View File

@ -25,6 +25,7 @@ import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.ClasspathResourceLoader;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
/** /**
@ -51,6 +52,22 @@ public class TestJapanesePartOfSpeechStopFilterFactory extends BaseTokenStreamTe
); );
} }
/**
 * If we don't specify "tags", then load the default stop tags.
 *
 * <p>Builds the factory with only {@code luceneMatchVersion} set, so the constructor falls back to
 * {@code JapaneseAnalyzer.getDefaultStopTags()}, and checks which tokens of the sample sentence
 * survive the resulting filter.
 *
 * @throws IOException if tokenizing or informing a factory fails
 */
public void testNoTagsSpecified() throws IOException {
  JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<String,String>());
  tokenizerFactory.inform(new StringMockResourceLoader(""));
  TokenStream ts = tokenizerFactory.create();
  ((Tokenizer)ts).setReader(new StringReader("私は制限スピードを超える。"));

  // Deliberately no "tags" entry: this is the case under test.
  Map<String,String> args = new HashMap<>();
  args.put("luceneMatchVersion", Version.LATEST.toString());
  JapanesePartOfSpeechStopFilterFactory factory = new JapanesePartOfSpeechStopFilterFactory(args);
  // With no "tags" argument there is no stop-tag file for inform() to read; it is still called
  // to mirror the normal factory lifecycle.
  factory.inform(new ClasspathResourceLoader(JapaneseAnalyzer.class));
  ts = factory.create(ts);

  // The expected tokens omit は, を and the trailing 。, which the default stop tags are expected
  // to filter out. The leading 私 must survive; every expected term is a literal piece of the input.
  assertTokenStreamContents(ts,
      new String[] { "私", "制限", "スピード", "超える" }
  );
}
/** Test that bogus arguments result in exception */ /** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception { public void testBogusArguments() throws Exception {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {