mirror of https://github.com/apache/lucene.git
LUCENE-9567: JPOSSFF loads built-in stop tags by default (#1961)
load stoptags.txt from analysis-kuromoji when no tags argument is specified
This commit is contained in:
parent
47a3f591a8
commit
4e0aa0d23b
|
@ -1,5 +1,11 @@
|
||||||
# Apache Lucene Migration Guide
|
# Apache Lucene Migration Guide
|
||||||
|
|
||||||
|
## JapanesePartOfSpeechStopFilterFactory loads default stop tags if "tags" argument not specified (LUCENE-9567)
|
||||||
|
|
||||||
|
Previously, JapanesePartOfSpeechStopFilterFactory added no filter if `args` didn't include "tags". Now, it will load
|
||||||
|
the default stop tags returned by `JapaneseAnalyzer.getDefaultStopTags()` (i.e. the tags from `stoptags.txt` in the
|
||||||
|
`lucene-analyzers-kuromoji` jar).
|
||||||
|
|
||||||
## ICUCollationKeyAnalyzer is renamed (LUCENE-9558)
|
## ICUCollationKeyAnalyzer is renamed (LUCENE-9558)
|
||||||
|
|
||||||
o.a.l.collation.ICUCollationAnalyzer is renamed to o.a.l.a.icu.ICUCollationKeyAnalyzer.
|
o.a.l.collation.ICUCollationAnalyzer is renamed to o.a.l.a.icu.ICUCollationKeyAnalyzer.
|
||||||
|
|
|
@ -53,6 +53,9 @@ public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory im
|
||||||
public JapanesePartOfSpeechStopFilterFactory(Map<String,String> args) {
|
public JapanesePartOfSpeechStopFilterFactory(Map<String,String> args) {
|
||||||
super(args);
|
super(args);
|
||||||
stopTagFiles = get(args, "tags");
|
stopTagFiles = get(args, "tags");
|
||||||
|
if (stopTagFiles == null) {
|
||||||
|
stopTags = JapaneseAnalyzer.getDefaultStopTags();
|
||||||
|
}
|
||||||
if (!args.isEmpty()) {
|
if (!args.isEmpty()) {
|
||||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||||
}
|
}
|
||||||
|
@ -65,13 +68,15 @@ public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory im
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void inform(ResourceLoader loader) throws IOException {
|
public void inform(ResourceLoader loader) throws IOException {
|
||||||
stopTags = null;
|
if (stopTagFiles != null) {
|
||||||
CharArraySet cas = getWordSet(loader, stopTagFiles, false);
|
stopTags = null;
|
||||||
if (cas != null) {
|
CharArraySet cas = getWordSet(loader, stopTagFiles, false);
|
||||||
stopTags = new HashSet<>();
|
if (cas != null) {
|
||||||
for (Object element : cas) {
|
stopTags = new HashSet<>();
|
||||||
char chars[] = (char[]) element;
|
for (Object element : cas) {
|
||||||
stopTags.add(new String(chars));
|
char chars[] = (char[]) element;
|
||||||
|
stopTags.add(new String(chars));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,6 +25,7 @@ import java.util.Map;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.util.ClasspathResourceLoader;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -50,6 +51,22 @@ public class TestJapanesePartOfSpeechStopFilterFactory extends BaseTokenStreamTe
|
||||||
new String[] { "私", "は", "制限", "スピード", "を" }
|
new String[] { "私", "は", "制限", "スピード", "を" }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** If we don't specify "tags", then load the default stop tags. */
|
||||||
|
public void testNoTagsSpecified() throws IOException {
|
||||||
|
JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<String,String>());
|
||||||
|
tokenizerFactory.inform(new StringMockResourceLoader(""));
|
||||||
|
TokenStream ts = tokenizerFactory.create();
|
||||||
|
((Tokenizer)ts).setReader(new StringReader("私は制限スピードを超える。"));
|
||||||
|
Map<String,String> args = new HashMap<>();
|
||||||
|
args.put("luceneMatchVersion", Version.LATEST.toString());
|
||||||
|
JapanesePartOfSpeechStopFilterFactory factory = new JapanesePartOfSpeechStopFilterFactory(args);
|
||||||
|
factory.inform(new ClasspathResourceLoader(JapaneseAnalyzer.class));
|
||||||
|
ts = factory.create(ts);
|
||||||
|
assertTokenStreamContents(ts,
|
||||||
|
new String[] { "私", "制限", "スピード", "超える" }
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
/** Test that bogus arguments result in exception */
|
/** Test that bogus arguments result in exception */
|
||||||
public void testBogusArguments() throws Exception {
|
public void testBogusArguments() throws Exception {
|
||||||
|
|
Loading…
Reference in New Issue