LUCENE-5820: SuggestStopFilter should have a factory

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1625193 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2014-09-16 00:21:29 +00:00
parent a0e0d79233
commit 16c7802f29
8 changed files with 311 additions and 0 deletions

View File

@ -7,6 +7,7 @@
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
<sourceFolder url="file://$MODULE_DIR$/src/resources" isTestSource="false" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />

View File

@ -127,6 +127,9 @@ New Features
Directory.renameFile so that in-progress commits are never visible.
(Robert Muir)
* LUCENE-5820: SuggestStopFilter should have a factory.
(Varun Thacker via Steve Rowe)
API Changes:
* LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and

View File

@ -0,0 +1,126 @@
package org.apache.lucene.search.suggest.analyzing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.WordlistLoader; // jdocs
import java.util.Map;
import java.io.IOException;
/**
* Factory for {@link SuggestStopFilter}.
*
* <pre class="prettyprint">
* &lt;fieldType name="autosuggest" class="solr.TextField"
* positionIncrementGap="100" autoGeneratePhraseQueries="true"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
* &lt;filter class="solr.SuggestStopFilterFactory" ignoreCase="true"
* words="stopwords.txt" format="wordset"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
* <p>
* All attributes are optional:
* </p>
* <ul>
* <li><code>ignoreCase</code> defaults to <code>false</code></li>
* <li><code>words</code> should be the name of a stopwords file to parse, if not
* specified the factory will use {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}
* </li>
* <li><code>format</code> defines how the <code>words</code> file will be parsed,
* and defaults to <code>wordset</code>. If <code>words</code> is not specified,
* then <code>format</code> must not be specified.
* </li>
* </ul>
* <p>
* The valid values for the <code>format</code> option are:
* </p>
* <ul>
* <li><code>wordset</code> - This is the default format, which supports one word per
* line (including any intra-word whitespace) and allows whole line comments
* beginning with the "#" character. Blank lines are ignored. See
* {@link WordlistLoader#getLines WordlistLoader.getLines} for details.
* </li>
* <li><code>snowball</code> - This format allows for multiple words specified on each
* line, and trailing comments may be specified using the vertical line ("&#124;").
* Blank lines are ignored. See
* {@link WordlistLoader#getSnowballWordSet WordlistLoader.getSnowballWordSet}
* for details.
* </li>
* </ul>
*/
public class SuggestStopFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {

  /** Value of the <code>format</code> argument that selects the word-set parser. */
  public static final String FORMAT_WORDSET = "wordset";

  /** Value of the <code>format</code> argument that selects the snowball parser. */
  public static final String FORMAT_SNOWBALL = "snowball";

  /** Stop word set; remains <code>null</code> until {@link #inform(ResourceLoader)} has run. */
  private CharArraySet stopWords;

  /** Value of the <code>words</code> argument, or <code>null</code> when it was not supplied. */
  private final String stopWordFiles;

  /** Value of the <code>format</code> argument; <code>null</code> only when neither it nor <code>words</code> was supplied. */
  private final String format;

  /** Value of the <code>ignoreCase</code> argument (defaults to <code>false</code>). */
  private final boolean ignoreCase;

  /**
   * Creates a new SuggestStopFilterFactory.
   *
   * @param args factory arguments; <code>words</code>, <code>format</code> and
   *        <code>ignoreCase</code> are read here and every entry must have been
   *        consumed by the time the constructor finishes
   * @throws IllegalArgumentException if any argument is left unconsumed
   */
  public SuggestStopFilterFactory(Map<String,String> args) {
    super(args);
    stopWordFiles = get(args, "words");

    // The format only falls back to "wordset" when a words file was given;
    // without one it stays null so that inform() can detect a stray 'format'.
    final String fallbackFormat;
    if (stopWordFiles == null) {
      fallbackFormat = null;
    } else {
      fallbackFormat = FORMAT_WORDSET;
    }
    format = get(args, "format", fallbackFormat);
    ignoreCase = getBoolean(args, "ignoreCase", false);

    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  /**
   * Builds the stop word set, either from the configured <code>words</code>
   * resource(s) or from {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}.
   *
   * @param loader used to resolve the <code>words</code> resource(s)
   * @throws IOException declared for the word list loading helpers
   * @throws IllegalArgumentException if <code>format</code> is not recognized, or
   *         if it was given without a <code>words</code> file
   */
  @Override
  public void inform(ResourceLoader loader) throws IOException {
    if (stopWordFiles == null) {
      // No explicit word list: a format makes no sense here.
      if (format != null) {
        throw new IllegalArgumentException("'format' can not be specified w/o an explicit 'words' file: " + format);
      }
      stopWords = new CharArraySet(StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
      return;
    }

    if (FORMAT_WORDSET.equalsIgnoreCase(format)) {
      stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
      return;
    }

    if (FORMAT_SNOWBALL.equalsIgnoreCase(format)) {
      stopWords = getSnowballWordSet(loader, stopWordFiles, ignoreCase);
      return;
    }

    throw new IllegalArgumentException("Unknown 'format' specified for 'words' file: " + format);
  }

  /** Returns the configured <code>ignoreCase</code> setting. */
  public boolean isIgnoreCase() {
    return ignoreCase;
  }

  /** Returns the stop word set built by {@link #inform(ResourceLoader)}. */
  public CharArraySet getStopWords() {
    return stopWords;
  }

  /** Wraps <code>input</code> in a {@link SuggestStopFilter} that uses this factory's stop word set. */
  @Override
  public TokenStream create(TokenStream input) {
    return new SuggestStopFilter(input, stopWords);
  }
}

View File

@ -0,0 +1,16 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.lucene.search.suggest.analyzing.SuggestStopFilterFactory

View File

@ -0,0 +1,121 @@
package org.apache.lucene.search.suggest.analyzing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.util.Version;
/** Tests for {@link SuggestStopFilterFactory}. */
public class TestSuggestStopFilterFactory extends BaseTokenStreamTestCase {

  /** Checks the stop word set and ignoreCase flag produced for each supported configuration. */
  public void testInform() throws Exception {
    // single word-set file
    SuggestStopFilterFactory factory = createFactory(
        "words", "stop-1.txt",
        "ignoreCase", "true");
    CharArraySet words = factory.getStopWords();
    assertNotNull("words is null and it shouldn't be", words);
    assertEquals(2, words.size());
    assertTrue(factory.isIgnoreCase());

    // two word-set files given as one comma-separated value
    factory = createFactory("words", "stop-1.txt, stop-2.txt",
        "ignoreCase", "true");
    words = factory.getStopWords();
    assertNotNull("words is null and it shouldn't be", words);
    assertEquals(4, words.size());
    assertTrue(factory.isIgnoreCase());

    // snowball format
    factory = createFactory("words", "stop-snowball.txt",
        "format", "snowball",
        "ignoreCase", "true");
    words = factory.getStopWords();
    assertEquals(8, words.size());
    assertTrue(words.contains("he"));
    assertTrue(words.contains("him"));
    assertTrue(words.contains("his"));
    assertTrue(words.contains("himself"));
    assertTrue(words.contains("she"));
    assertTrue(words.contains("her"));
    assertTrue(words.contains("hers"));
    assertTrue(words.contains("herself"));

    // defaults
    factory = createFactory();
    assertEquals(StopAnalyzer.ENGLISH_STOP_WORDS_SET, factory.getStopWords());
    assertFalse(factory.isIgnoreCase());
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    try {
      createFactory("bogusArg", "bogusValue");
      fail();
    } catch (IllegalArgumentException expected) {
      assertTrue(expected.getMessage().contains("Unknown parameters"));
    }
  }

  /**
   * Test that a bogus format results in exception, both with an explicit
   * words file and without one.
   */
  public void testBogusFormats() throws Exception {
    try {
      createFactory("words", "stop-snowball.txt",
          "format", "bogus");
      fail();
    } catch (IllegalArgumentException expected) {
      String msg = expected.getMessage();
      assertTrue(msg, msg.contains("Unknown"));
      assertTrue(msg, msg.contains("format"));
      assertTrue(msg, msg.contains("bogus"));
    }

    try {
      createFactory(
          // implicit default words file
          "format", "bogus");
      fail();
    } catch (IllegalArgumentException expected) {
      String msg = expected.getMessage();
      assertTrue(msg, msg.contains("can not be specified"));
      assertTrue(msg, msg.contains("format"));
      assertTrue(msg, msg.contains("bogus"));
    }
  }

  /**
   * Builds a factory from alternating key/value strings, adds the
   * <code>luceneMatchVersion</code> argument, and calls
   * {@link SuggestStopFilterFactory#inform} with a classpath resource loader
   * rooted at this test class.
   *
   * @param params alternating argument names and values
   * @return the informed factory
   * @throws IllegalArgumentException if <code>params</code> has an odd length
   * @throws IOException declared by <code>inform</code>
   */
  private SuggestStopFilterFactory createFactory(String ... params) throws IOException {
    if (params.length % 2 != 0) {
      throw new IllegalArgumentException("invalid keysAndValues map");
    }
    Map<String, String> args = new HashMap<>(params.length / 2);
    for (int i = 0; i < params.length; i += 2) {
      String previous = args.put(params[i], params[i + 1]);
      assertNull("duplicate values for key: " + params[i], previous);
    }
    args.put("luceneMatchVersion", Version.LATEST.toString());

    SuggestStopFilterFactory factory = new SuggestStopFilterFactory(args);
    ResourceLoader loader = new ClasspathResourceLoader(getClass());
    factory.inform(loader);
    return factory;
  }
}

View File

@ -0,0 +1,17 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
foo
bar

View File

@ -0,0 +1,17 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
junk
more

View File

@ -0,0 +1,10 @@
| This is a file in snowball format, empty lines are ignored, '|' is a comment
| Additionally, multiple words can be on the same line, allowing stopwords to be
| arranged in tables (useful in some languages where they might inflect)
| fictitious table below
|third person singular
|Subject Object Possessive Reflexive
he him his himself| masculine
she her hers herself| feminine