mirror of https://github.com/apache/lucene.git

SOLR-1860: support snowball format in stoplists

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1240784 13f79535-47bb-0310-9956-ffa450edef68

parent bef6e3664d
commit c880ef0647
solr/CHANGES.txt
@@ -475,6 +475,10 @@ New Features
 * LUCENE-3305, SOLR-3056: Added Kuromoji morphological analyzer for Japanese.
   (Christian Moen, Masaru Hasegawa via Robert Muir)
 
+* SOLR-1860: StopFilterFactory, CommonGramsFilterFactory, and
+  CommonGramsQueryFilterFactory can optionally read stopwords in Snowball
+  format (specify format="snowball"). (Robert Muir)
+
 Optimizations
 ----------------------
 * SOLR-1931: Speedup for LukeRequestHandler and admin/schema browser. New parameter
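The CHANGES entry above is the whole user-facing surface of the change: in schema.xml the new behaviour is selected with a format="snowball" attribute on the <filter/> element, and that attribute reaches the factory through its init args. Below is a minimal programmatic sketch of the same thing, mirroring the tests added at the end of this commit; the SolrResourceLoader instance directory and the luceneMatchVersion string are assumptions, not part of the patch.

import java.util.HashMap;
import java.util.Map;
import org.apache.solr.analysis.StopFilterFactory;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.core.SolrResourceLoader;

// Sketch only -- not part of the patch; mirrors the factory tests further down.
public class SnowballStopwordsSketch {
  public static void main(String[] argv) throws Exception {
    // assumed: an instance dir whose conf/ directory contains stop-snowball.txt
    ResourceLoader loader = new SolrResourceLoader("solr");

    Map<String, String> args = new HashMap<String, String>();
    args.put("luceneMatchVersion", "LUCENE_40"); // assumed version string for trunk
    args.put("words", "stop-snowball.txt");      // resolved through the ResourceLoader
    args.put("format", "snowball");              // the new SOLR-1860 switch; omit for one-word-per-line files
    args.put("ignoreCase", "true");

    StopFilterFactory factory = new StopFilterFactory();
    factory.init(args);
    factory.inform(loader);                      // reads and parses the stopword file
    System.out.println(factory.getStopWords().size()); // 8 for the sample file added below
  }
}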
BaseTokenStreamFactory.java
@@ -23,11 +23,18 @@ import org.apache.solr.core.Config;
 import org.apache.solr.schema.IndexSchema;
 
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
 import java.util.List;
 import java.util.Map;
 
 import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 
 import org.slf4j.Logger;
@@ -129,4 +136,34 @@ abstract class BaseTokenStreamFactory {
     }
     return words;
   }
+
+  /** same as {@link #getWordSet(ResourceLoader, String, boolean)},
+   * except the input is in snowball format. */
+  protected CharArraySet getSnowballWordSet(ResourceLoader loader,
+      String wordFiles, boolean ignoreCase) throws IOException {
+    assureMatchVersion();
+    List<String> files = StrUtils.splitFileNames(wordFiles);
+    CharArraySet words = null;
+    if (files.size() > 0) {
+      // default stopwords list has 35 or so words, but maybe don't make it that
+      // big to start
+      words = new CharArraySet(luceneMatchVersion,
+          files.size() * 10, ignoreCase);
+      for (String file : files) {
+        InputStream stream = null;
+        Reader reader = null;
+        try {
+          stream = loader.openResource(file.trim());
+          CharsetDecoder decoder = IOUtils.CHARSET_UTF_8.newDecoder()
+              .onMalformedInput(CodingErrorAction.REPORT)
+              .onUnmappableCharacter(CodingErrorAction.REPORT);
+          reader = new InputStreamReader(stream, decoder);
+          WordlistLoader.getSnowballWordSet(reader, words);
+        } finally {
+          IOUtils.closeWhileHandlingException(reader, stream);
+        }
+      }
+    }
+    return words;
+  }
 }
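A detail of getSnowballWordSet worth calling out: the Reader is built from a CharsetDecoder with both error actions set to CodingErrorAction.REPORT, so a stopword file that is not valid UTF-8 makes the load fail instead of being silently decoded with replacement characters. The following is a standalone sketch of that behaviour using only JDK classes; the byte values are invented for illustration, and the patch itself obtains the decoder from IOUtils.CHARSET_UTF_8.

import java.io.ByteArrayInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.MalformedInputException;

// Sketch only -- demonstrates the strict decoding the helper above configures.
public class StrictUtf8Sketch {
  public static void main(String[] argv) throws Exception {
    byte[] bogus = { 'h', 'e', (byte) 0xC3, (byte) 0x28 };  // 0xC3 0x28 is not valid UTF-8
    CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
        .onMalformedInput(CodingErrorAction.REPORT)
        .onUnmappableCharacter(CodingErrorAction.REPORT);
    Reader reader = new InputStreamReader(new ByteArrayInputStream(bogus), decoder);
    try {
      while (reader.read() != -1) { /* drain */ }
      System.out.println("decoded cleanly");
    } catch (MalformedInputException e) {
      System.out.println("bad UTF-8 rejected: " + e);  // what REPORT buys over silent replacement
    } finally {
      reader.close();
    }
  }
}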
CommonGramsFilterFactory.java
@@ -50,7 +50,11 @@ public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements
 
     if (commonWordFiles != null) {
       try {
-        commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
+        if ("snowball".equalsIgnoreCase(args.get("format"))) {
+          commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase);
+        } else {
+          commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
+        }
       } catch (IOException e) {
         throw new RuntimeException(e);
       }
CommonGramsQueryFilterFactory.java
@@ -57,7 +57,11 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
 
     if (commonWordFiles != null) {
       try {
-        commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
+        if ("snowball".equalsIgnoreCase(args.get("format"))) {
+          commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase);
+        } else {
+          commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
+        }
       } catch (IOException e) {
         throw new RuntimeException(e);
       }
StopFilterFactory.java
@@ -56,7 +56,11 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc
 
     if (stopWordFiles != null) {
       try {
-        stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
+        if ("snowball".equalsIgnoreCase(args.get("format"))) {
+          stopWords = getSnowballWordSet(loader, stopWordFiles, ignoreCase);
+        } else {
+          stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
+        }
       } catch (IOException e) {
         throw new RuntimeException(e);
       }
stop-snowball.txt (new file)
@@ -0,0 +1,10 @@
+| This is a file in snowball format, empty lines are ignored, '|' is a comment
+| Additionally, multiple words can be on the same line, allowing stopwords to be
+| arranged in tables (useful in some languages where they might inflect)
+
+| fictitious table below
+
+|third person singular
+|Subject Object Possessive Reflexive
+he him his himself| masculine
+she her hers herself| feminine
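To make the format rules documented in the sample file concrete, here is a small sketch, not part of the patch, that feeds snowball-formatted text directly to WordlistLoader.getSnowballWordSet, the same helper the new factory method delegates to; the Version constant is an assumption for trunk at this point.

import java.io.StringReader;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.Version;

// Sketch only -- shows how the sample file above is interpreted.
public class SnowballParseSketch {
  public static void main(String[] argv) throws Exception {
    String snowball =
        "| comment lines and trailing comments are dropped\n" +
        "\n" +                                   // blank lines are ignored
        "he him his himself| masculine\n" +      // several words per line, each its own entry
        "she her hers herself| feminine\n";
    CharArraySet words = new CharArraySet(Version.LUCENE_40, 10, true); // version constant assumed
    WordlistLoader.getSnowballWordSet(new StringReader(snowball), words);
    System.out.println(words.size());            // 8
    System.out.println(words.contains("hers"));  // true
  }
}

Comment text after '|' is dropped and every whitespace-separated token on a line becomes its own entry, which is why the factory tests below expect exactly eight stopwords from the sample file.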
CommonGramsFilterFactoryTest.java
@@ -62,6 +62,21 @@ public class CommonGramsFilterFactoryTest extends BaseTokenTestCase {
     assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory
         .isIgnoreCase() == true);
 
+    factory = new CommonGramsFilterFactory();
+    args.put("words", "stop-snowball.txt");
+    args.put("format", "snowball");
+    factory.init(args);
+    factory.inform(loader);
+    words = factory.getCommonWords();
+    assertEquals(8, words.size());
+    assertTrue(words.contains("he"));
+    assertTrue(words.contains("him"));
+    assertTrue(words.contains("his"));
+    assertTrue(words.contains("himself"));
+    assertTrue(words.contains("she"));
+    assertTrue(words.contains("her"));
+    assertTrue(words.contains("hers"));
+    assertTrue(words.contains("herself"));
   }
 
   /**
CommonGramsQueryFilterFactoryTest.java
@@ -61,6 +61,21 @@ public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase {
     assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory
         .isIgnoreCase() == true);
 
+    factory = new CommonGramsQueryFilterFactory();
+    args.put("words", "stop-snowball.txt");
+    args.put("format", "snowball");
+    factory.init(args);
+    factory.inform(loader);
+    words = factory.getCommonWords();
+    assertEquals(8, words.size());
+    assertTrue(words.contains("he"));
+    assertTrue(words.contains("him"));
+    assertTrue(words.contains("his"));
+    assertTrue(words.contains("himself"));
+    assertTrue(words.contains("she"));
+    assertTrue(words.contains("her"));
+    assertTrue(words.contains("hers"));
+    assertTrue(words.contains("herself"));
   }
 
   /**
TestStopFilterFactory.java
@@ -53,6 +53,20 @@ public class TestStopFilterFactory extends BaseTokenTestCase {
     assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
     assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);
 
 
+    factory = new StopFilterFactory();
+    args.put("words", "stop-snowball.txt");
+    args.put("format", "snowball");
+    factory.init(args);
+    factory.inform(loader);
+    words = factory.getStopWords();
+    assertEquals(8, words.size());
+    assertTrue(words.contains("he"));
+    assertTrue(words.contains("him"));
+    assertTrue(words.contains("his"));
+    assertTrue(words.contains("himself"));
+    assertTrue(words.contains("she"));
+    assertTrue(words.contains("her"));
+    assertTrue(words.contains("hers"));
+    assertTrue(words.contains("herself"));
   }
 }