mirror of https://github.com/apache/lucene.git
LUCENE-4542: Make HunspellStemFilter's maximum recursion level configurable.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1504529 13f79535-47bb-0310-9956-ffa450edef68
parent 97fb0209ce
commit 59a1435742
lucene/CHANGES.txt
@@ -72,6 +72,9 @@ API Changes
 * LUCENE-5114: Remove unused boolean useCache parameter from
   TermsEnum.seekCeil and .seekExact (Mike McCandless)
 
+* LUCENE-4542: HunspellStemFilter's maximum recursion level is now configurable.
+  (Piotr, Rafał Kuć via Adrien Grand)
+
 Optimizations
 
 * LUCENE-5088: Added TermFilter to filter docs by a specific term.
lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java
@@ -55,17 +55,31 @@ public final class HunspellStemFilter extends TokenFilter {
 
   private final boolean dedup;
 
+  /** Create a {@link HunspellStemFilter} which deduplicates stems and has a maximum
+   *  recursion level of 2.
+   *  @see #HunspellStemFilter(TokenStream, HunspellDictionary, int) */
+  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary) {
+    this(input, dictionary, 2);
+  }
+
   /**
    * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
    * HunspellDictionary
    *
    * @param input TokenStream whose tokens will be stemmed
    * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
+   * @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
    */
-  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary) {
-    this(input, dictionary, true);
+  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, int recursionCap) {
+    this(input, dictionary, true, recursionCap);
   }
 
+  /** Create a {@link HunspellStemFilter} which has a maximum recursion level of 2.
+   *  @see #HunspellStemFilter(TokenStream, HunspellDictionary, boolean, int) */
+  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup) {
+    this(input, dictionary, dedup, 2);
+  }
+
   /**
    * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
    * HunspellDictionary
@@ -73,11 +87,12 @@ public final class HunspellStemFilter extends TokenFilter {
    * @param input TokenStream whose tokens will be stemmed
    * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
    * @param dedup true if only unique terms should be output.
+   * @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
    */
-  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup) {
+  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup, int recursionCap) {
     super(input);
     this.dedup = dedup;
-    this.stemmer = new HunspellStemmer(dictionary);
+    this.stemmer = new HunspellStemmer(dictionary, recursionCap);
   }
 
   /**
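As a usage sketch, assuming an en_US .aff/.dic pair on disk and the Lucene 4.x API of that era (file names and the Version constant are illustrative assumptions, not part of this commit), the new constructor can be driven like this:

import java.io.FileInputStream;
import java.io.InputStream;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.hunspell.HunspellDictionary;
import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class HunspellRecursionCapDemo {
  public static void main(String[] args) throws Exception {
    // Assumed resource names; any Hunspell affix/dictionary pair works.
    InputStream affix = new FileInputStream("en_US.aff");
    InputStream words = new FileInputStream("en_US.dic");
    HunspellDictionary dictionary = new HunspellDictionary(affix, words, Version.LUCENE_44);

    // recursionCap = 0 disables recursive affix stripping entirely;
    // the existing two- and three-argument constructors still default to 2.
    TokenStream ts = new HunspellStemFilter(
        new WhitespaceTokenizer(Version.LUCENE_44, new StringReader("lucene is awesome")),
        dictionary, 0);

    // Standard TokenStream consumption contract: reset, increment, end, close.
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}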
lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java
@@ -54,12 +54,14 @@ public class HunspellStemFilterFactory extends TokenFilterFactory implements Res
   private static final String PARAM_AFFIX = "affix";
   private static final String PARAM_IGNORE_CASE = "ignoreCase";
   private static final String PARAM_STRICT_AFFIX_PARSING = "strictAffixParsing";
+  private static final String PARAM_RECURSION_CAP = "recursionCap";
 
   private final String dictionaryArg;
   private final String affixFile;
   private final boolean ignoreCase;
   private final boolean strictAffixParsing;
   private HunspellDictionary dictionary;
+  private int recursionCap;
 
   /** Creates a new HunspellStemFilterFactory */
   public HunspellStemFilterFactory(Map<String,String> args) {
@@ -69,6 +71,7 @@ public class HunspellStemFilterFactory extends TokenFilterFactory implements Res
     affixFile = get(args, PARAM_AFFIX);
     ignoreCase = getBoolean(args, PARAM_IGNORE_CASE, false);
     strictAffixParsing = getBoolean(args, PARAM_STRICT_AFFIX_PARSING, true);
+    recursionCap = getInt(args, PARAM_RECURSION_CAP, 2);
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -111,6 +114,6 @@ public class HunspellStemFilterFactory extends TokenFilterFactory implements Res
    */
   @Override
   public TokenStream create(TokenStream tokenStream) {
-    return new HunspellStemFilter(tokenStream, dictionary);
+    return new HunspellStemFilter(tokenStream, dictionary, recursionCap);
   }
 }
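A sketch of the new factory knob, under the same assumptions about resource names; in practice the parameter arrives from declarative analyzer configuration, and the factory must be informed with a ResourceLoader so it can load the dictionary before create() is called:

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.hunspell.HunspellStemFilterFactory;

public class HunspellFactoryConfigDemo {
  public static void main(String[] args) {
    Map<String,String> params = new HashMap<String,String>();
    params.put("luceneMatchVersion", "4.4"); // consumed by the factory framework (assumption)
    params.put("dictionary", "en_US.dic");   // assumed resource names
    params.put("affix", "en_US.aff");
    params.put("recursionCap", "0");         // new in this commit; defaults to 2 when absent
    HunspellStemFilterFactory factory = new HunspellStemFilterFactory(params);
    // factory.inform(resourceLoader);       // loads the dictionary/affix resources, then:
    // TokenStream ts = factory.create(tokenizer);
  }
}

Note that the constructor rejects any parameter it does not consume ("Unknown parameters"), which is why recognizing recursionCap in the constructor, as the hunk above does, is required before users can set it.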
lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
@@ -17,16 +17,10 @@ package org.apache.lucene.analysis.hunspell;
  * limitations under the License.
  */
 
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.Charset;
-import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
-import java.util.Scanner;
 
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.CharacterUtils;
@@ -37,22 +31,32 @@ import org.apache.lucene.util.Version;
  * conforms to the algorithm in the original hunspell algorithm, including recursive suffix stripping.
  */
 public class HunspellStemmer {
 
-  private static final int RECURSION_CAP = 2;
-
+  private final int recursionCap;
   private final HunspellDictionary dictionary;
   private final StringBuilder segment = new StringBuilder();
   private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_40);
 
   /**
-   * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
+   * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems. Uses the
+   * default recursion cap of <code>2</code> (based on Hunspell documentation).
    *
    * @param dictionary HunspellDictionary that will be used to create the stems
    */
   public HunspellStemmer(HunspellDictionary dictionary) {
-    this.dictionary = dictionary;
+    this(dictionary, 2);
   }
 
+  /**
+   * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
+   *
+   * @param dictionary HunspellDictionary that will be used to create the stems
+   * @param recursionCap maximum level of recursion stemmer can go into
+   */
+  public HunspellStemmer(HunspellDictionary dictionary, int recursionCap) {
+    this.dictionary = dictionary;
+    this.recursionCap = recursionCap;
+  }
+
   /**
    * Find the stem(s) of the provided word
    *
@@ -194,7 +198,7 @@ public class HunspellStemmer {
       }
     }
 
-    if (affix.isCrossProduct() && recursionDepth < RECURSION_CAP) {
+    if (affix.isCrossProduct() && recursionDepth < recursionCap) {
       stems.addAll(stem(strippedWord, length, affix.getAppendFlags(), ++recursionDepth));
     }
 
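Since recursionDepth starts at 0 and only cross-product affixes recurse, a cap of 0 applies each affix rule once with no recursive stripping, while larger caps allow deeper affix chains. A direct-use sketch of the stemmer, with dictionary setup as in the earlier example (the stem(String) entry point and Stem inner class are quoted from the 4.x API from memory):

import java.util.List;

import org.apache.lucene.analysis.hunspell.HunspellDictionary;
import org.apache.lucene.analysis.hunspell.HunspellStemmer;

public class HunspellStemmerCapDemo {
  // Compare how many stems are found at different recursion caps.
  static void compare(HunspellDictionary dictionary, String word) {
    HunspellStemmer shallow = new HunspellStemmer(dictionary, 0); // no recursive stripping
    HunspellStemmer deep = new HunspellStemmer(dictionary, 4);    // beyond the old fixed cap of 2
    List<HunspellStemmer.Stem> stems = deep.stem(word);
    System.out.println(word + " -> " + stems.size() + " stem(s) at cap 4, "
        + shallow.stem(word).size() + " at cap 0");
  }
}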
lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java
@@ -30,6 +30,7 @@ import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util._TestUtil;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 
@@ -57,13 +58,13 @@ public class HunspellStemFilterTest extends BaseTokenStreamTestCase {
   public void testKeywordAttribute() throws IOException {
     MockTokenizer tokenizer = new MockTokenizer(new StringReader("lucene is awesome"), MockTokenizer.WHITESPACE, true);
     tokenizer.setEnableChecks(true);
-    HunspellStemFilter filter = new HunspellStemFilter(tokenizer, DICTIONARY);
+    HunspellStemFilter filter = new HunspellStemFilter(tokenizer, DICTIONARY, _TestUtil.nextInt(random(), 1, 3));
     assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1});
 
     // assert with keyword marker
     tokenizer = new MockTokenizer(new StringReader("lucene is awesome"), MockTokenizer.WHITESPACE, true);
     CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true);
-    filter = new HunspellStemFilter(new SetKeywordMarkerFilter(tokenizer, set), DICTIONARY);
+    filter = new HunspellStemFilter(new SetKeywordMarkerFilter(tokenizer, set), DICTIONARY, _TestUtil.nextInt(random(), 1, 3));
     assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
   }
 
@@ -74,7 +75,7 @@ public class HunspellStemFilterTest extends BaseTokenStreamTestCase {
       @Override
       protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
         Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY));
+        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY, _TestUtil.nextInt(random(), 1, 3)));
       }
     };
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
@@ -85,7 +86,7 @@ public class HunspellStemFilterTest extends BaseTokenStreamTestCase {
       @Override
       protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
         Tokenizer tokenizer = new KeywordTokenizer(reader);
-        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY));
+        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY, _TestUtil.nextInt(random(), 1, 3)));
       }
     };
     checkOneTermReuse(a, "", "");