LUCENE-9567: JPOSSFF loads built-in stop tags by default (#1961)

load stoptags.txt from analysis-kuromoji when no tags argument is specified
This commit is contained in:
msfroh 2020-10-09 07:52:07 -07:00 committed by GitHub
parent 47a3f591a8
commit 4e0aa0d23b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 35 additions and 7 deletions

View File

@ -1,5 +1,11 @@
# Apache Lucene Migration Guide
## JapanesePartOfSpeechStopFilterFactory loads default stop tags if "tags" argument not specified (LUCENE-9567)
Previously, JapanesePartOfSpeechStopFilterFactory added no filter if `args` didn't include "tags". Now, it will load
the default stop tags returned by `JapaneseAnalyzer.getDefaultStopTags()` (i.e. the tags from`stoptags.txt` in the
`lucene-analyzers-kuromoji` jar.)
## ICUCollationKeyAnalyzer is renamed (LUCENE-9558)
o.a.l.collation.ICUCollationAnalyzer is renamed to o.a.l.a.icu.ICUCollationKeyAnalyzer.

View File

@ -53,6 +53,9 @@ public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory im
public JapanesePartOfSpeechStopFilterFactory(Map<String,String> args) {
super(args);
stopTagFiles = get(args, "tags");
if (stopTagFiles == null) {
stopTags = JapaneseAnalyzer.getDefaultStopTags();
}
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -65,13 +68,15 @@ public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory im
@Override
public void inform(ResourceLoader loader) throws IOException {
stopTags = null;
CharArraySet cas = getWordSet(loader, stopTagFiles, false);
if (cas != null) {
stopTags = new HashSet<>();
for (Object element : cas) {
char chars[] = (char[]) element;
stopTags.add(new String(chars));
if (stopTagFiles != null) {
stopTags = null;
CharArraySet cas = getWordSet(loader, stopTagFiles, false);
if (cas != null) {
stopTags = new HashSet<>();
for (Object element : cas) {
char chars[] = (char[]) element;
stopTags.add(new String(chars));
}
}
}
}

View File

@ -25,6 +25,7 @@ import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.ClasspathResourceLoader;
import org.apache.lucene.util.Version;
/**
@ -50,6 +51,22 @@ public class TestJapanesePartOfSpeechStopFilterFactory extends BaseTokenStreamTe
new String[] { "", "", "制限", "スピード", "" }
);
}
/** If we don't specify "tags", then load the default stop tags. */
public void testNoTagsSpecified() throws IOException {
JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<String,String>());
tokenizerFactory.inform(new StringMockResourceLoader(""));
TokenStream ts = tokenizerFactory.create();
((Tokenizer)ts).setReader(new StringReader("私は制限スピードを超える。"));
Map<String,String> args = new HashMap<>();
args.put("luceneMatchVersion", Version.LATEST.toString());
JapanesePartOfSpeechStopFilterFactory factory = new JapanesePartOfSpeechStopFilterFactory(args);
factory.inform(new ClasspathResourceLoader(JapaneseAnalyzer.class));
ts = factory.create(ts);
assertTokenStreamContents(ts,
new String[] { "", "制限", "スピード", "超える" }
);
}
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {