From c880ef064789401f1351af56755ea165b3e04444 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Sun, 5 Feb 2012 19:33:56 +0000 Subject: [PATCH] SOLR-1860: support snowball format in stoplists git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1240784 13f79535-47bb-0310-9956-ffa450edef68 --- solr/CHANGES.txt | 4 ++ .../solr/analysis/BaseTokenStreamFactory.java | 37 +++++++++++++++++++ .../analysis/CommonGramsFilterFactory.java | 6 ++- .../CommonGramsQueryFilterFactory.java | 6 ++- .../solr/analysis/StopFilterFactory.java | 6 ++- .../test-files/solr/conf/stop-snowball.txt | 10 +++++ .../CommonGramsFilterFactoryTest.java | 15 ++++++++ .../CommonGramsQueryFilterFactoryTest.java | 15 ++++++++ .../solr/analysis/TestStopFilterFactory.java | 16 +++++++- 9 files changed, 111 insertions(+), 4 deletions(-) create mode 100644 solr/core/src/test-files/solr/conf/stop-snowball.txt diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index d2d5aeafee3..0baec610661 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -475,6 +475,10 @@ New Features * LUCENE-3305, SOLR-3056: Added Kuromoji morphological analyzer for Japanese. (Christian Moen, Masaru Hasegawa via Robert Muir) +* SOLR-1860: StopFilterFactory, CommonGramsFilterFactory, and + CommonGramsQueryFilterFactory can optionally read stopwords in Snowball + format (specify format="snowball"). (Robert Muir) + Optimizations ---------------------- * SOLR-1931: Speedup for LukeRequestHandler and admin/schema browser. New parameter diff --git a/solr/core/src/java/org/apache/solr/analysis/BaseTokenStreamFactory.java b/solr/core/src/java/org/apache/solr/analysis/BaseTokenStreamFactory.java index bc5adddd0ee..40fedb66524 100644 --- a/solr/core/src/java/org/apache/solr/analysis/BaseTokenStreamFactory.java +++ b/solr/core/src/java/org/apache/solr/analysis/BaseTokenStreamFactory.java @@ -23,11 +23,18 @@ import org.apache.solr.core.Config; import org.apache.solr.schema.IndexSchema; import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; import java.util.List; import java.util.Map; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.slf4j.Logger; @@ -129,4 +136,34 @@ abstract class BaseTokenStreamFactory { } return words; } + + /** same as {@link #getWordSet(ResourceLoader, String, boolean)}, + * except the input is in snowball format. */ + protected CharArraySet getSnowballWordSet(ResourceLoader loader, + String wordFiles, boolean ignoreCase) throws IOException { + assureMatchVersion(); + List files = StrUtils.splitFileNames(wordFiles); + CharArraySet words = null; + if (files.size() > 0) { + // default stopwords list has 35 or so words, but maybe don't make it that + // big to start + words = new CharArraySet(luceneMatchVersion, + files.size() * 10, ignoreCase); + for (String file : files) { + InputStream stream = null; + Reader reader = null; + try { + stream = loader.openResource(file.trim()); + CharsetDecoder decoder = IOUtils.CHARSET_UTF_8.newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + reader = new InputStreamReader(stream, decoder); + WordlistLoader.getSnowballWordSet(reader, words); + } finally { + IOUtils.closeWhileHandlingException(reader, stream); + } + } + } + return words; + } } diff --git a/solr/core/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java index 5eef2dd4c01..87b2a308bab 100644 --- a/solr/core/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java +++ b/solr/core/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java @@ -50,7 +50,11 @@ public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements if (commonWordFiles != null) { try { - commonWords = getWordSet(loader, commonWordFiles, ignoreCase); + if ("snowball".equalsIgnoreCase(args.get("format"))) { + commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase); + } else { + commonWords = getWordSet(loader, commonWordFiles, ignoreCase); + } } catch (IOException e) { throw new RuntimeException(e); } diff --git a/solr/core/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java index e1bb85c350d..f70dfac9fb2 100644 --- a/solr/core/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java +++ b/solr/core/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java @@ -57,7 +57,11 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory if (commonWordFiles != null) { try { - commonWords = getWordSet(loader, commonWordFiles, ignoreCase); + if ("snowball".equalsIgnoreCase(args.get("format"))) { + commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase); + } else { + commonWords = getWordSet(loader, commonWordFiles, ignoreCase); + } } catch (IOException e) { throw new RuntimeException(e); } diff --git a/solr/core/src/java/org/apache/solr/analysis/StopFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/StopFilterFactory.java index 6c8691ec646..143137e377a 100644 --- a/solr/core/src/java/org/apache/solr/analysis/StopFilterFactory.java +++ b/solr/core/src/java/org/apache/solr/analysis/StopFilterFactory.java @@ -56,7 +56,11 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc if (stopWordFiles != null) { try { - stopWords = getWordSet(loader, stopWordFiles, ignoreCase); + if ("snowball".equalsIgnoreCase(args.get("format"))) { + stopWords = getSnowballWordSet(loader, stopWordFiles, ignoreCase); + } else { + stopWords = getWordSet(loader, stopWordFiles, ignoreCase); + } } catch (IOException e) { throw new RuntimeException(e); } diff --git a/solr/core/src/test-files/solr/conf/stop-snowball.txt b/solr/core/src/test-files/solr/conf/stop-snowball.txt new file mode 100644 index 00000000000..1c0c6f51142 --- /dev/null +++ b/solr/core/src/test-files/solr/conf/stop-snowball.txt @@ -0,0 +1,10 @@ + | This is a file in snowball format, empty lines are ignored, '|' is a comment + | Additionally, multiple words can be on the same line, allowing stopwords to be + | arranged in tables (useful in some languages where they might inflect) + + | fictitious table below + +|third person singular +|Subject Object Possessive Reflexive +he him his himself| masculine +she her hers herself| feminine diff --git a/solr/core/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java b/solr/core/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java index 3757663cb1f..4f18f071b9e 100644 --- a/solr/core/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java +++ b/solr/core/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java @@ -62,6 +62,21 @@ public class CommonGramsFilterFactoryTest extends BaseTokenTestCase { assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory .isIgnoreCase() == true); + factory = new CommonGramsFilterFactory(); + args.put("words", "stop-snowball.txt"); + args.put("format", "snowball"); + factory.init(args); + factory.inform(loader); + words = factory.getCommonWords(); + assertEquals(8, words.size()); + assertTrue(words.contains("he")); + assertTrue(words.contains("him")); + assertTrue(words.contains("his")); + assertTrue(words.contains("himself")); + assertTrue(words.contains("she")); + assertTrue(words.contains("her")); + assertTrue(words.contains("hers")); + assertTrue(words.contains("herself")); } /** diff --git a/solr/core/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java b/solr/core/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java index 711752b5acb..bf0afffdab0 100644 --- a/solr/core/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java +++ b/solr/core/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java @@ -61,6 +61,21 @@ public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase { assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory .isIgnoreCase() == true); + factory = new CommonGramsQueryFilterFactory(); + args.put("words", "stop-snowball.txt"); + args.put("format", "snowball"); + factory.init(args); + factory.inform(loader); + words = factory.getCommonWords(); + assertEquals(8, words.size()); + assertTrue(words.contains("he")); + assertTrue(words.contains("him")); + assertTrue(words.contains("his")); + assertTrue(words.contains("himself")); + assertTrue(words.contains("she")); + assertTrue(words.contains("her")); + assertTrue(words.contains("hers")); + assertTrue(words.contains("herself")); } /** diff --git a/solr/core/src/test/org/apache/solr/analysis/TestStopFilterFactory.java b/solr/core/src/test/org/apache/solr/analysis/TestStopFilterFactory.java index f799c85907a..dae4551bf4a 100644 --- a/solr/core/src/test/org/apache/solr/analysis/TestStopFilterFactory.java +++ b/solr/core/src/test/org/apache/solr/analysis/TestStopFilterFactory.java @@ -53,6 +53,20 @@ public class TestStopFilterFactory extends BaseTokenTestCase { assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4); assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true); - + factory = new StopFilterFactory(); + args.put("words", "stop-snowball.txt"); + args.put("format", "snowball"); + factory.init(args); + factory.inform(loader); + words = factory.getStopWords(); + assertEquals(8, words.size()); + assertTrue(words.contains("he")); + assertTrue(words.contains("him")); + assertTrue(words.contains("his")); + assertTrue(words.contains("himself")); + assertTrue(words.contains("she")); + assertTrue(words.contains("her")); + assertTrue(words.contains("hers")); + assertTrue(words.contains("herself")); } }