mirror of https://github.com/apache/lucene.git
LUCENE-5820: SuggestStopFilter should have a factory
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1625193 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a0e0d79233
commit
16c7802f29
|
@ -7,6 +7,7 @@
|
|||
<content url="file://$MODULE_DIR$">
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/resources" isTestSource="false" />
|
||||
</content>
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
|
|
|
@ -127,6 +127,9 @@ New Features
|
|||
Directory.renameFile so that in-progress commits are never visible.
|
||||
(Robert Muir)
|
||||
|
||||
* LUCENE-5820: SuggestStopFilter should have a factory.
|
||||
(Varun Thacker via Steve Rowe)
|
||||
|
||||
API Changes:
|
||||
|
||||
* LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and
|
||||
|
|
|
@ -0,0 +1,126 @@
|
|||
package org.apache.lucene.search.suggest.analyzing;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.core.StopAnalyzer;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.analysis.util.ResourceLoader;
|
||||
import org.apache.lucene.analysis.util.ResourceLoaderAware;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
import org.apache.lucene.analysis.util.WordlistLoader; // jdocs
|
||||
|
||||
import java.util.Map;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Factory for {@link SuggestStopFilter}.
|
||||
*
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="autosuggest" class="solr.TextField"
|
||||
* positionIncrementGap="100" autoGeneratePhraseQueries="true">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
* <filter class="solr.LowerCaseFilterFactory"/>
|
||||
* <filter class="solr.SuggestStopFilterFactory" ignoreCase="true"
|
||||
* words="stopwords.txt" format="wordset"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*
|
||||
* <p>
|
||||
* All attributes are optional:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li><code>ignoreCase</code> defaults to <code>false</code></li>
|
||||
* <li><code>words</code> should be the name of a stopwords file to parse, if not
|
||||
* specified the factory will use {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}
|
||||
* </li>
|
||||
* <li><code>format</code> defines how the <code>words</code> file will be parsed,
|
||||
* and defaults to <code>wordset</code>. If <code>words</code> is not specified,
|
||||
* then <code>format</code> must not be specified.
|
||||
* </li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* The valid values for the <code>format</code> option are:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li><code>wordset</code> - This is the default format, which supports one word per
|
||||
* line (including any intra-word whitespace) and allows whole line comments
|
||||
* begining with the "#" character. Blank lines are ignored. See
|
||||
* {@link WordlistLoader#getLines WordlistLoader.getLines} for details.
|
||||
* </li>
|
||||
* <li><code>snowball</code> - This format allows for multiple words specified on each
|
||||
* line, and trailing comments may be specified using the vertical line ("|").
|
||||
* Blank lines are ignored. See
|
||||
* {@link WordlistLoader#getSnowballWordSet WordlistLoader.getSnowballWordSet}
|
||||
* for details.
|
||||
* </li>
|
||||
* </ul>
|
||||
*/
|
||||
public class SuggestStopFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
|
||||
public static final String FORMAT_WORDSET = "wordset";
|
||||
public static final String FORMAT_SNOWBALL = "snowball";
|
||||
|
||||
private CharArraySet stopWords;
|
||||
private final String stopWordFiles;
|
||||
private final String format;
|
||||
private final boolean ignoreCase;
|
||||
|
||||
/** Creates a new StopFilterFactory */
|
||||
public SuggestStopFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
stopWordFiles = get(args, "words");
|
||||
format = get(args, "format", (null == stopWordFiles ? null : FORMAT_WORDSET));
|
||||
ignoreCase = getBoolean(args, "ignoreCase", false);
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void inform(ResourceLoader loader) throws IOException {
|
||||
if (stopWordFiles != null) {
|
||||
if (FORMAT_WORDSET.equalsIgnoreCase(format)) {
|
||||
stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
|
||||
} else if (FORMAT_SNOWBALL.equalsIgnoreCase(format)) {
|
||||
stopWords = getSnowballWordSet(loader, stopWordFiles, ignoreCase);
|
||||
} else {
|
||||
throw new IllegalArgumentException("Unknown 'format' specified for 'words' file: " + format);
|
||||
}
|
||||
} else {
|
||||
if (null != format) {
|
||||
throw new IllegalArgumentException("'format' can not be specified w/o an explicit 'words' file: " + format);
|
||||
}
|
||||
stopWords = new CharArraySet(StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isIgnoreCase() {
|
||||
return ignoreCase;
|
||||
}
|
||||
|
||||
public CharArraySet getStopWords() {
|
||||
return stopWords;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
SuggestStopFilter suggestStopFilter = new SuggestStopFilter(input, stopWords);
|
||||
return suggestStopFilter;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
org.apache.lucene.search.suggest.analyzing.SuggestStopFilterFactory
|
|
@ -0,0 +1,121 @@
|
|||
package org.apache.lucene.search.suggest.analyzing;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.core.StopAnalyzer;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
|
||||
import org.apache.lucene.analysis.util.ResourceLoader;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
public class TestSuggestStopFilterFactory extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testInform() throws Exception {
|
||||
ResourceLoader loader = new ClasspathResourceLoader(getClass());
|
||||
assertTrue("loader is null and it shouldn't be", loader != null);
|
||||
SuggestStopFilterFactory factory = createFactory(
|
||||
"words", "stop-1.txt",
|
||||
"ignoreCase", "true");
|
||||
CharArraySet words = factory.getStopWords();
|
||||
assertTrue("words is null and it shouldn't be", words != null);
|
||||
assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
|
||||
assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);
|
||||
|
||||
factory = createFactory("words", "stop-1.txt, stop-2.txt",
|
||||
"ignoreCase", "true");
|
||||
words = factory.getStopWords();
|
||||
assertTrue("words is null and it shouldn't be", words != null);
|
||||
assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
|
||||
assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);
|
||||
|
||||
factory = createFactory("words", "stop-snowball.txt",
|
||||
"format", "snowball",
|
||||
"ignoreCase", "true");
|
||||
words = factory.getStopWords();
|
||||
assertEquals(8, words.size());
|
||||
assertTrue(words.contains("he"));
|
||||
assertTrue(words.contains("him"));
|
||||
assertTrue(words.contains("his"));
|
||||
assertTrue(words.contains("himself"));
|
||||
assertTrue(words.contains("she"));
|
||||
assertTrue(words.contains("her"));
|
||||
assertTrue(words.contains("hers"));
|
||||
assertTrue(words.contains("herself"));
|
||||
|
||||
// defaults
|
||||
factory = createFactory();
|
||||
assertEquals(StopAnalyzer.ENGLISH_STOP_WORDS_SET, factory.getStopWords());
|
||||
assertEquals(false, factory.isIgnoreCase());
|
||||
}
|
||||
|
||||
/** Test that bogus arguments result in exception */
|
||||
public void testBogusArguments() throws Exception {
|
||||
try {
|
||||
createFactory("bogusArg", "bogusValue");
|
||||
fail();
|
||||
} catch (IllegalArgumentException expected) {
|
||||
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||
}
|
||||
}
|
||||
|
||||
/** Test that bogus arguments result in exception */
|
||||
public void testBogusFormats() throws Exception {
|
||||
try {
|
||||
createFactory("words", "stop-snowball.txt",
|
||||
"format", "bogus");
|
||||
fail();
|
||||
} catch (IllegalArgumentException expected) {
|
||||
String msg = expected.getMessage();
|
||||
assertTrue(msg, msg.contains("Unknown"));
|
||||
assertTrue(msg, msg.contains("format"));
|
||||
assertTrue(msg, msg.contains("bogus"));
|
||||
}
|
||||
try {
|
||||
createFactory(
|
||||
// implicit default words file
|
||||
"format", "bogus");
|
||||
fail();
|
||||
} catch (IllegalArgumentException expected) {
|
||||
String msg = expected.getMessage();
|
||||
assertTrue(msg, msg.contains("can not be specified"));
|
||||
assertTrue(msg, msg.contains("format"));
|
||||
assertTrue(msg, msg.contains("bogus"));
|
||||
}
|
||||
}
|
||||
|
||||
private SuggestStopFilterFactory createFactory(String ... params) throws IOException {
|
||||
if(params.length%2 != 0) {
|
||||
throw new IllegalArgumentException("invalid keysAndValues map");
|
||||
}
|
||||
Map<String, String> args = new HashMap<>(params.length/2);
|
||||
for(int i=0; i<params.length; i+=2) {
|
||||
String previous = args.put(params[i], params[i+1]);
|
||||
assertNull("duplicate values for key: " + params[i], previous);
|
||||
}
|
||||
args.put("luceneMatchVersion", Version.LATEST.toString());
|
||||
|
||||
SuggestStopFilterFactory factory = new SuggestStopFilterFactory(args);
|
||||
factory.inform(new ClasspathResourceLoader(getClass()));
|
||||
return factory;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,17 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
foo
|
||||
bar
|
|
@ -0,0 +1,17 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
junk
|
||||
more
|
|
@ -0,0 +1,10 @@
|
|||
| This is a file in snowball format, empty lines are ignored, '|' is a comment
|
||||
| Additionally, multiple words can be on the same line, allowing stopwords to be
|
||||
| arranged in tables (useful in some languages where they might inflect)
|
||||
|
||||
| fictitious table below
|
||||
|
||||
|third person singular
|
||||
|Subject Object Possessive Reflexive
|
||||
he him his himself| masculine
|
||||
she her hers herself| feminine
|
Loading…
Reference in New Issue