mirror of https://github.com/apache/lucene.git
LUCENE-9567: JPOSSFF loads built-in stop tags by default (#1961)
load stoptags.txt from analysis-kuromoji when no tags argument is specified
This commit is contained in:
parent
47a3f591a8
commit
4e0aa0d23b
|
@ -1,5 +1,11 @@
|
|||
# Apache Lucene Migration Guide
|
||||
|
||||
## JapanesePartOfSpeechStopFilterFactory loads default stop tags if "tags" argument not specified (LUCENE-9567)
|
||||
|
||||
Previously, JapanesePartOfSpeechStopFilterFactory added no filter if `args` didn't include "tags". Now, it will load
|
||||
the default stop tags returned by `JapaneseAnalyzer.getDefaultStopTags()` (i.e. the tags from `stoptags.txt` in the
|
||||
`lucene-analyzers-kuromoji` jar).
|
||||
|
||||
## ICUCollationKeyAnalyzer is renamed (LUCENE-9558)
|
||||
|
||||
o.a.l.collation.ICUCollationKeyAnalyzer is renamed to o.a.l.a.icu.ICUCollationKeyAnalyzer.
|
||||
|
|
|
@ -53,6 +53,9 @@ public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory im
|
|||
public JapanesePartOfSpeechStopFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
stopTagFiles = get(args, "tags");
|
||||
if (stopTagFiles == null) {
|
||||
stopTags = JapaneseAnalyzer.getDefaultStopTags();
|
||||
}
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
|
@ -65,13 +68,15 @@ public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory im
|
|||
|
||||
@Override
|
||||
public void inform(ResourceLoader loader) throws IOException {
|
||||
stopTags = null;
|
||||
CharArraySet cas = getWordSet(loader, stopTagFiles, false);
|
||||
if (cas != null) {
|
||||
stopTags = new HashSet<>();
|
||||
for (Object element : cas) {
|
||||
char chars[] = (char[]) element;
|
||||
stopTags.add(new String(chars));
|
||||
if (stopTagFiles != null) {
|
||||
stopTags = null;
|
||||
CharArraySet cas = getWordSet(loader, stopTagFiles, false);
|
||||
if (cas != null) {
|
||||
stopTags = new HashSet<>();
|
||||
for (Object element : cas) {
|
||||
char chars[] = (char[]) element;
|
||||
stopTags.add(new String(chars));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -25,6 +25,7 @@ import java.util.Map;
|
|||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.util.ClasspathResourceLoader;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
|
@ -50,6 +51,22 @@ public class TestJapanesePartOfSpeechStopFilterFactory extends BaseTokenStreamTe
|
|||
new String[] { "私", "は", "制限", "スピード", "を" }
|
||||
);
|
||||
}
|
||||
|
||||
/** If we don't specify "tags", then load the default stop tags. */
|
||||
public void testNoTagsSpecified() throws IOException {
|
||||
JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<String,String>());
|
||||
tokenizerFactory.inform(new StringMockResourceLoader(""));
|
||||
TokenStream ts = tokenizerFactory.create();
|
||||
((Tokenizer)ts).setReader(new StringReader("私は制限スピードを超える。"));
|
||||
Map<String,String> args = new HashMap<>();
|
||||
args.put("luceneMatchVersion", Version.LATEST.toString());
|
||||
JapanesePartOfSpeechStopFilterFactory factory = new JapanesePartOfSpeechStopFilterFactory(args);
|
||||
factory.inform(new ClasspathResourceLoader(JapaneseAnalyzer.class));
|
||||
ts = factory.create(ts);
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "私", "制限", "スピード", "超える" }
|
||||
);
|
||||
}
|
||||
|
||||
/** Test that bogus arguments result in exception */
|
||||
public void testBogusArguments() throws Exception {
|
||||
|
|
Loading…
Reference in New Issue