mirror of https://github.com/apache/lucene.git
LUCENE-5211: Better javadocs and error checking of 'format' option in StopFilterFactory, as well as comments in all snowball formatted files about specifying format option
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1524809 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
c7188a03a3
commit
499bd3688a
|
@ -78,6 +78,12 @@ API Changes:
|
|||
with IndexSearcher when an ExecutorService is specified.
|
||||
(Ryan Ernst, Mike McCandless, Robert Muir)
|
||||
|
||||
Documentation
|
||||
|
||||
* LUCENE-5211: Better javadocs and error checking of 'format' option in
|
||||
StopFilterFactory, as well as comments in all snowball formatted files
|
||||
about specifying format option. (hossman)
|
||||
|
||||
Changes in backwards compatibility policy
|
||||
|
||||
* LUCENE-5204: Directory doesn't have default implementations for
|
||||
|
|
|
@ -22,22 +22,57 @@ import org.apache.lucene.analysis.util.CharArraySet;
|
|||
import org.apache.lucene.analysis.util.ResourceLoader;
|
||||
import org.apache.lucene.analysis.util.ResourceLoaderAware;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
import org.apache.lucene.analysis.util.WordlistLoader; // jdocs
|
||||
|
||||
import java.util.Map;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Factory for {@link StopFilter}.
|
||||
*
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_stop" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
* <filter class="solr.StopFilterFactory" ignoreCase="true"
|
||||
* words="stopwords.txt"
|
||||
* words="stopwords.txt" format="wordset" />
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*
|
||||
* <p>
|
||||
* All attributes are optional:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li><code>ignoreCase</code> defaults to <code>false</code></li>
|
||||
* <li><code>words</code> should be the name of a stopwords file to parse, if not
|
||||
* specified the factory will use {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}
|
||||
* </li>
|
||||
* <li><code>format</code> defines how the <code>words</code> file will be parsed,
|
||||
* and defaults to <code>wordset</code>. If <code>words</code> is not specified,
|
||||
* then <code>format</code> must not be specified.
|
||||
* </li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* The valid values for the <code>format</code> option are:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li><code>wordset</code> - This is the default format, which supports one word per
|
||||
* line (including any intra-word whitespace) and allows whole line comments
|
||||
* beginning with the "#" character. Blank lines are ignored. See
|
||||
* {@link WordlistLoader#getLines WordlistLoader.getLines} for details.
|
||||
* </li>
|
||||
* <li><code>snowball</code> - This format allows for multiple words specified on each
|
||||
* line, and trailing comments may be specified using the vertical line ("|").
|
||||
* Blank lines are ignored. See
|
||||
* {@link WordlistLoader#getSnowballWordSet WordlistLoader.getSnowballWordSet}
|
||||
* for details.
|
||||
* </li>
|
||||
* </ul>
|
||||
*/
|
||||
public class StopFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
|
||||
public static final String FORMAT_WORDSET = "wordset";
|
||||
public static final String FORMAT_SNOWBALL = "snowball";
|
||||
|
||||
private CharArraySet stopWords;
|
||||
private final String stopWordFiles;
|
||||
private final String format;
|
||||
|
@ -48,7 +83,7 @@ public class StopFilterFactory extends TokenFilterFactory implements ResourceLoa
|
|||
super(args);
|
||||
assureMatchVersion();
|
||||
stopWordFiles = get(args, "words");
|
||||
format = get(args, "format");
|
||||
format = get(args, "format", (null == stopWordFiles ? null : FORMAT_WORDSET));
|
||||
ignoreCase = getBoolean(args, "ignoreCase", false);
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
|
@ -58,12 +93,17 @@ public class StopFilterFactory extends TokenFilterFactory implements ResourceLoa
|
|||
@Override
|
||||
public void inform(ResourceLoader loader) throws IOException {
|
||||
if (stopWordFiles != null) {
|
||||
if ("snowball".equalsIgnoreCase(format)) {
|
||||
if (FORMAT_WORDSET.equalsIgnoreCase(format)) {
|
||||
stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
|
||||
} else if (FORMAT_SNOWBALL.equalsIgnoreCase(format)) {
|
||||
stopWords = getSnowballWordSet(loader, stopWordFiles, ignoreCase);
|
||||
} else {
|
||||
stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
|
||||
throw new IllegalArgumentException("Unknown 'format' specified for 'words' file: " + format);
|
||||
}
|
||||
} else {
|
||||
if (null != format) {
|
||||
throw new IllegalArgumentException("'format' can not be specified w/o an explicit 'words' file: " + format);
|
||||
}
|
||||
stopWords = new CharArraySet(luceneMatchVersion, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Danish stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Dutch stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| An English stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| forms of BE
|
||||
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A French stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A German stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| Hungarian stop word list
|
||||
| prepared by Anna Tordai
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| An Italian stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Norwegian stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Portuguese stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| a russian stop word list. comments begin with vertical bar. each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Spanish stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Swedish stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -57,6 +57,11 @@ public class TestStopFilterFactory extends BaseTokenStreamFactoryTestCase {
|
|||
assertTrue(words.contains("her"));
|
||||
assertTrue(words.contains("hers"));
|
||||
assertTrue(words.contains("herself"));
|
||||
|
||||
// defaults
|
||||
factory = (StopFilterFactory) tokenFilterFactory("Stop");
|
||||
assertEquals(StopAnalyzer.ENGLISH_STOP_WORDS_SET, factory.getStopWords());
|
||||
assertEquals(false, factory.isIgnoreCase());
|
||||
}
|
||||
|
||||
/** Test that bogus arguments result in exception */
|
||||
|
@ -68,4 +73,30 @@ public class TestStopFilterFactory extends BaseTokenStreamFactoryTestCase {
|
|||
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||
}
|
||||
}
|
||||
|
||||
/** Test that bogus 'format' values result in exception, with or without an explicit 'words' file */
|
||||
public void testBogusFormats() throws Exception {
|
||||
try {
|
||||
tokenFilterFactory("Stop",
|
||||
"words", "stop-snowball.txt",
|
||||
"format", "bogus");
|
||||
fail();
|
||||
} catch (IllegalArgumentException expected) {
|
||||
String msg = expected.getMessage();
|
||||
assertTrue(msg, msg.contains("Unknown"));
|
||||
assertTrue(msg, msg.contains("format"));
|
||||
assertTrue(msg, msg.contains("bogus"));
|
||||
}
|
||||
try {
|
||||
tokenFilterFactory("Stop",
|
||||
// implicit default words file
|
||||
"format", "bogus");
|
||||
fail();
|
||||
} catch (IllegalArgumentException expected) {
|
||||
String msg = expected.getMessage();
|
||||
assertTrue(msg, msg.contains("can not be specified"));
|
||||
assertTrue(msg, msg.contains("format"));
|
||||
assertTrue(msg, msg.contains("bogus"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Danish stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A German stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Spanish stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| forms of BE
|
||||
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A French stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| Hungarian stop word list
|
||||
| prepared by Anna Tordai
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| An Italian stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Dutch stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Norwegian stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Portuguese stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| a russian stop word list. comments begin with vertical bar. each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Swedish stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Danish stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A German stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Spanish stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| forms of BE
|
||||
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A French stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| Hungarian stop word list
|
||||
| prepared by Anna Tordai
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| An Italian stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Dutch stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Norwegian stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Portuguese stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| a russian stop word list. comments begin with vertical bar. each stop
|
||||
| word is at the start of a line.
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Swedish stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
|
Loading…
Reference in New Issue