LUCENE-5211: Better javadocs and error checking of 'format' option in StopFilterFactory, as well as comments in all snowball formatted files about specifying format option

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1524809 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Chris M. Hostetter 2013-09-19 19:06:37 +00:00
parent c7188a03a3
commit 499bd3688a
40 changed files with 155 additions and 4 deletions

View File

@ -78,6 +78,12 @@ API Changes:
with IndexSearcher when an ExecutorService is specified.
(Ryan Ernst, Mike McCandless, Robert Muir)
Documentation
* LUCENE-5211: Better javadocs and error checking of 'format' option in
StopFilterFactory, as well as comments in all snowball formatted files
about specifying format option. (hossman)
Changes in backwards compatibility policy
* LUCENE-5204: Directory doesn't have default implementations for

View File

@ -22,22 +22,57 @@ import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.WordlistLoader; // jdocs
import java.util.Map;
import java.io.IOException;
/**
* Factory for {@link StopFilter}.
*
* <pre class="prettyprint">
* &lt;fieldType name="text_stop" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.StopFilterFactory" ignoreCase="true"
* words="stopwords.txt"
* words="stopwords.txt" format="wordset" /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
* <p>
* All attributes are optional:
* </p>
* <ul>
* <li><code>ignoreCase</code> defaults to <code>false</code></li>
* <li><code>words</code> should be the name of a stopwords file to parse, if not
* specified the factory will use {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}
* </li>
* <li><code>format</code> defines how the <code>words</code> file will be parsed,
* and defaults to <code>wordset</code>. If <code>words</code> is not specified,
* then <code>format</code> must not be specified.
* </li>
* </ul>
* <p>
* The valid values for the <code>format</code> option are:
* </p>
* <ul>
* <li><code>wordset</code> - This is the default format, which supports one word per
* line (including any intra-word whitespace) and allows whole line comments
* beginning with the "#" character. Blank lines are ignored. See
* {@link WordlistLoader#getLines WordlistLoader.getLines} for details.
* </li>
* <li><code>snowball</code> - This format allows for multiple words specified on each
* line, and trailing comments may be specified using the vertical line ("&#124;").
* Blank lines are ignored. See
* {@link WordlistLoader#getSnowballWordSet WordlistLoader.getSnowballWordSet}
* for details.
* </li>
* </ul>
*/
public class StopFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
public static final String FORMAT_WORDSET = "wordset";
public static final String FORMAT_SNOWBALL = "snowball";
private CharArraySet stopWords;
private final String stopWordFiles;
private final String format;
@ -48,7 +83,7 @@ public class StopFilterFactory extends TokenFilterFactory implements ResourceLoa
super(args);
assureMatchVersion();
stopWordFiles = get(args, "words");
format = get(args, "format");
format = get(args, "format", (null == stopWordFiles ? null : FORMAT_WORDSET));
ignoreCase = getBoolean(args, "ignoreCase", false);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
@ -58,12 +93,17 @@ public class StopFilterFactory extends TokenFilterFactory implements ResourceLoa
@Override
public void inform(ResourceLoader loader) throws IOException {
if (stopWordFiles != null) {
if ("snowball".equalsIgnoreCase(format)) {
if (FORMAT_WORDSET.equalsIgnoreCase(format)) {
stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
} else if (FORMAT_SNOWBALL.equalsIgnoreCase(format)) {
stopWords = getSnowballWordSet(loader, stopWordFiles, ignoreCase);
} else {
stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
throw new IllegalArgumentException("Unknown 'format' specified for 'words' file: " + format);
}
} else {
if (null != format) {
throw new IllegalArgumentException("'format' can not be specified w/o an explicit 'words' file: " + format);
}
stopWords = new CharArraySet(luceneMatchVersion, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
}
}

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Danish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Dutch stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| An English stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| forms of BE

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A French stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A German stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| Hungarian stop word list
| prepared by Anna Tordai

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| An Italian stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Norwegian stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Portuguese stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| a russian stop word list. comments begin with vertical bar. each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Spanish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Swedish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -57,6 +57,11 @@ public class TestStopFilterFactory extends BaseTokenStreamFactoryTestCase {
assertTrue(words.contains("her"));
assertTrue(words.contains("hers"));
assertTrue(words.contains("herself"));
// defaults
factory = (StopFilterFactory) tokenFilterFactory("Stop");
assertEquals(StopAnalyzer.ENGLISH_STOP_WORDS_SET, factory.getStopWords());
assertEquals(false, factory.isIgnoreCase());
}
/** Test that bogus arguments result in exception */
@ -68,4 +73,30 @@ public class TestStopFilterFactory extends BaseTokenStreamFactoryTestCase {
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
}
/**
 * Verifies that invalid uses of the 'format' option are rejected with an
 * IllegalArgumentException whose message names the offending value.
 */
public void testBogusFormats() throws Exception {
  // Case 1: an explicit 'words' file combined with an unrecognized format name.
  IllegalArgumentException unknownFormat = null;
  try {
    tokenFilterFactory("Stop",
        "words", "stop-snowball.txt",
        "format", "bogus");
  } catch (IllegalArgumentException e) {
    unknownFormat = e;
  }
  if (null == unknownFormat) {
    // no exception was raised, so the bogus format was silently accepted
    fail();
  }
  final String unknownMsg = unknownFormat.getMessage();
  for (String fragment : new String[] {"Unknown", "format", "bogus"}) {
    assertTrue(unknownMsg, unknownMsg.contains(fragment));
  }

  // Case 2: a format is given but no 'words' file (implicit default word list).
  IllegalArgumentException formatWithoutWords = null;
  try {
    tokenFilterFactory("Stop",
        "format", "bogus");
  } catch (IllegalArgumentException e) {
    formatWithoutWords = e;
  }
  if (null == formatWithoutWords) {
    fail();
  }
  final String missingWordsMsg = formatWithoutWords.getMessage();
  for (String fragment : new String[] {"can not be specified", "format", "bogus"}) {
    assertTrue(missingWordsMsg, missingWordsMsg.contains(fragment));
  }
}
}

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Danish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A German stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Spanish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| forms of BE

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A French stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| Hungarian stop word list
| prepared by Anna Tordai

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| An Italian stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Dutch stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Norwegian stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Portuguese stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| a russian stop word list. comments begin with vertical bar. each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Swedish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Danish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A German stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Spanish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| forms of BE

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A French stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| Hungarian stop word list
| prepared by Anna Tordai

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| An Italian stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Dutch stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Norwegian stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Portuguese stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| a russian stop word list. comments begin with vertical bar. each stop
| word is at the start of a line.

View File

@ -4,6 +4,8 @@
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Swedish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.