SOLR-1860: support snowball format in stoplists

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1240784 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-02-05 19:33:56 +00:00
parent bef6e3664d
commit c880ef0647
9 changed files with 111 additions and 4 deletions

View File

@ -475,6 +475,10 @@ New Features
* LUCENE-3305, SOLR-3056: Added Kuromoji morphological analyzer for Japanese. * LUCENE-3305, SOLR-3056: Added Kuromoji morphological analyzer for Japanese.
(Christian Moen, Masaru Hasegawa via Robert Muir) (Christian Moen, Masaru Hasegawa via Robert Muir)
* SOLR-1860: StopFilterFactory, CommonGramsFilterFactory, and
CommonGramsQueryFilterFactory can optionally read stopwords in Snowball
format (specify format="snowball"). (Robert Muir)
Optimizations Optimizations
---------------------- ----------------------
* SOLR-1931: Speedup for LukeRequestHandler and admin/schema browser. New parameter * SOLR-1931: Speedup for LukeRequestHandler and admin/schema browser. New parameter

View File

@ -23,11 +23,18 @@ import org.apache.solr.core.Config;
import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.IndexSchema;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -129,4 +136,34 @@ abstract class BaseTokenStreamFactory {
} }
return words; return words;
} }
/**
 * Snowball-format counterpart of
 * {@link #getWordSet(ResourceLoader, String, boolean)}.
 * <p>
 * {@code wordFiles} is split into individual resource names with
 * {@code StrUtils.splitFileNames}; every named resource is opened through
 * {@code loader}, decoded as UTF-8 and handed to
 * {@code WordlistLoader.getSnowballWordSet}, which adds its entries to one
 * shared set.
 *
 * @param loader     used to open each word-list resource
 * @param wordFiles  the file-name list to split and load
 * @param ignoreCase passed straight to the {@link CharArraySet} constructor
 * @return the populated set, or {@code null} when {@code wordFiles} yields
 *         no file names
 * @throws IOException if a resource cannot be opened or read
 */
protected CharArraySet getSnowballWordSet(ResourceLoader loader,
String wordFiles, boolean ignoreCase) throws IOException {
assureMatchVersion();
final List<String> fileNames = StrUtils.splitFileNames(wordFiles);
if (fileNames.isEmpty()) {
// nothing to load: callers get null, not an empty set
return null;
}

// Start small (10 slots per file) and let the set grow as entries are added.
final int initialCapacity = fileNames.size() * 10;
final CharArraySet result =
new CharArraySet(luceneMatchVersion, initialCapacity, ignoreCase);

for (String fileName : fileNames) {
InputStream in = null;
Reader utf8Reader = null;
try {
in = loader.openResource(fileName.trim());
// Decoder is configured to REPORT malformed or unmappable input
// instead of using the default coding-error action.
CharsetDecoder strictUtf8 = IOUtils.CHARSET_UTF_8.newDecoder();
strictUtf8.onMalformedInput(CodingErrorAction.REPORT);
strictUtf8.onUnmappableCharacter(CodingErrorAction.REPORT);
utf8Reader = new InputStreamReader(in, strictUtf8);
WordlistLoader.getSnowballWordSet(utf8Reader, result);
} finally {
// either handle may still be null if opening the resource failed
IOUtils.closeWhileHandlingException(utf8Reader, in);
}
}
return result;
}
} }

View File

@ -50,7 +50,11 @@ public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements
if (commonWordFiles != null) { if (commonWordFiles != null) {
try { try {
commonWords = getWordSet(loader, commonWordFiles, ignoreCase); if ("snowball".equalsIgnoreCase(args.get("format"))) {
commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase);
} else {
commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
}
} catch (IOException e) { } catch (IOException e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }

View File

@ -57,7 +57,11 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
if (commonWordFiles != null) { if (commonWordFiles != null) {
try { try {
commonWords = getWordSet(loader, commonWordFiles, ignoreCase); if ("snowball".equalsIgnoreCase(args.get("format"))) {
commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase);
} else {
commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
}
} catch (IOException e) { } catch (IOException e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }

View File

@ -56,7 +56,11 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc
if (stopWordFiles != null) { if (stopWordFiles != null) {
try { try {
stopWords = getWordSet(loader, stopWordFiles, ignoreCase); if ("snowball".equalsIgnoreCase(args.get("format"))) {
stopWords = getSnowballWordSet(loader, stopWordFiles, ignoreCase);
} else {
stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
}
} catch (IOException e) { } catch (IOException e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }

View File

@ -0,0 +1,10 @@
| This is a file in snowball format, empty lines are ignored, '|' is a comment
| Additionally, multiple words can be on the same line, allowing stopwords to be
| arranged in tables (useful in some languages where they might inflect)
| fictitious table below
|third person singular
|Subject Object Possessive Reflexive
he him his himself| masculine
she her hers herself| feminine

View File

@ -62,6 +62,21 @@ public class CommonGramsFilterFactoryTest extends BaseTokenTestCase {
assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory
.isIgnoreCase() == true); .isIgnoreCase() == true);
factory = new CommonGramsFilterFactory();
args.put("words", "stop-snowball.txt");
args.put("format", "snowball");
factory.init(args);
factory.inform(loader);
words = factory.getCommonWords();
assertEquals(8, words.size());
assertTrue(words.contains("he"));
assertTrue(words.contains("him"));
assertTrue(words.contains("his"));
assertTrue(words.contains("himself"));
assertTrue(words.contains("she"));
assertTrue(words.contains("her"));
assertTrue(words.contains("hers"));
assertTrue(words.contains("herself"));
} }
/** /**

View File

@ -61,6 +61,21 @@ public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase {
assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory
.isIgnoreCase() == true); .isIgnoreCase() == true);
factory = new CommonGramsQueryFilterFactory();
args.put("words", "stop-snowball.txt");
args.put("format", "snowball");
factory.init(args);
factory.inform(loader);
words = factory.getCommonWords();
assertEquals(8, words.size());
assertTrue(words.contains("he"));
assertTrue(words.contains("him"));
assertTrue(words.contains("his"));
assertTrue(words.contains("himself"));
assertTrue(words.contains("she"));
assertTrue(words.contains("her"));
assertTrue(words.contains("hers"));
assertTrue(words.contains("herself"));
} }
/** /**

View File

@ -53,6 +53,20 @@ public class TestStopFilterFactory extends BaseTokenTestCase {
assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4); assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true); assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);
factory = new StopFilterFactory();
args.put("words", "stop-snowball.txt");
args.put("format", "snowball");
factory.init(args);
factory.inform(loader);
words = factory.getStopWords();
assertEquals(8, words.size());
assertTrue(words.contains("he"));
assertTrue(words.contains("him"));
assertTrue(words.contains("his"));
assertTrue(words.contains("himself"));
assertTrue(words.contains("she"));
assertTrue(words.contains("her"));
assertTrue(words.contains("hers"));
assertTrue(words.contains("herself"));
} }
} }