SOLR-1857: cleanup and sync analysis with Lucene trunk

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@929782 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2010-04-01 02:15:27 +00:00
parent a528a707c1
commit 3860c16a66
124 changed files with 771 additions and 1264 deletions

View File

@@ -126,6 +126,14 @@ New Features

 * SOLR-1769: Solr 1.4 Replication - Repeater throwing NullPointerException (Jörgen Rydenius via noble)

+* SOLR-1857: Synced Solr analysis with Lucene 3.1. Added KeywordMarkerFilterFactory
+  and StemmerOverrideFilterFactory, which can be used to tune stemming algorithms.
+  Added factories for Bulgarian, Czech, Hindi, and Turkish analysis. Improved the
+  performance of SnowballPorterFilterFactory. (rmuir)
+
+* SOLR-1657: Converted remaining TokenStreams to the Attributes-based API. All Solr
+  TokenFilters now support custom Attributes, and some have improved performance:
+  especially WordDelimiterFilter and CommonGramsFilter. (rmuir, cmale, uschindler)
+
 Optimizations
 ----------------------
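The KeywordMarkerFilterFactory mentioned in the SOLR-1857 entry wraps Lucene's KeywordMarkerTokenFilter, which flags matching terms as keywords so downstream stemmers leave them alone; the EnglishPorterFilterFactory diff further down builds exactly this chain. A minimal sketch of that chain, assuming the pre-Version WhitespaceTokenizer constructor, with purely illustrative sample words:

import java.io.StringReader;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.util.Version;

public class ProtectedStemmingSketch {
  public static void main(String[] args) throws Exception {
    // "solr" is marked as a keyword, so the stemmer must not alter it;
    // "running" is unprotected and stems normally.
    CharArraySet protectedWords = new CharArraySet(Version.LUCENE_31, 4, true);
    protectedWords.add("solr");

    TokenStream ts = new WhitespaceTokenizer(new StringReader("solr running"));
    ts = new KeywordMarkerTokenFilter(ts, protectedWords);
    ts = new SnowballFilter(ts, new org.tartarus.snowball.ext.EnglishStemmer());
    while (ts.incrementToken()) {
      // consume the stream; attributes carry the (possibly stemmed) terms
    }
  }
}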

View File

@@ -18,9 +18,10 @@
 package org.apache.solr.analysis;

-import org.apache.lucene.analysis.*;
-import org.apache.lucene.util.ArrayUtil;
-import java.util.Map;
+import org.apache.lucene.analysis.ASCIIFoldingFilter;
+import org.apache.lucene.analysis.TokenStream;

+/** Factory for {@link ASCIIFoldingFilter} */
 public class ASCIIFoldingFilterFactory extends BaseTokenFilterFactory {
   public ASCIIFoldingFilter create(TokenStream input) {
     return new ASCIIFoldingFilter(input);

View File

@@ -16,15 +16,13 @@ package org.apache.solr.analysis;
  * limitations under the License.
  */

-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
 import java.io.Reader;

 /**
- *
- *
+ * Factory for {@link ArabicLetterTokenizer}
 **/
 public class ArabicLetterTokenizerFactory extends BaseTokenizerFactory{

View File

@@ -21,8 +21,7 @@ import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;

 /**
- *
- *
+ * Factory for {@link ArabicNormalizationFilter}
 **/
 public class ArabicNormalizationFilterFactory extends BaseTokenFilterFactory{

View File

@@ -21,8 +21,7 @@ import org.apache.lucene.analysis.ar.ArabicStemFilter;

 /**
- *
- *
+ * Factory for {@link ArabicStemFilter}
 **/
 public class ArabicStemFilterFactory extends BaseTokenFilterFactory{

View File

@@ -17,13 +17,17 @@
 package org.apache.solr.analysis;

+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.util.StrUtils;
 import org.apache.solr.core.Config;
+import org.apache.solr.common.SolrException;
 import org.apache.solr.schema.IndexSchema;

+import java.io.IOException;
+import java.util.List;
 import java.util.Map;

-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.util.Version;

@@ -94,4 +98,22 @@ abstract class BaseTokenStreamFactory {
     return Boolean.parseBoolean(s);
   }
+
+  protected CharArraySet getWordSet(ResourceLoader loader,
+      String wordFiles, boolean ignoreCase) throws IOException {
+    assureMatchVersion();
+    List<String> files = StrUtils.splitFileNames(wordFiles);
+    CharArraySet words = null;
+    if (files.size() > 0) {
+      // default stopwords list has 35 or so words, but maybe don't make it that
+      // big to start
+      words = new CharArraySet(luceneMatchVersion,
+          files.size() * 10, ignoreCase);
+      for (String file : files) {
+        List<String> wlist = loader.getLines(file.trim());
+        words.addAll(StopFilter.makeStopSet(luceneMatchVersion, wlist,
+            ignoreCase));
+      }
+    }
+    return words;
+  }
 }
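The getWordSet helper added here centralizes what each factory below previously hand-rolled: splitting a comma-separated list of word files, reading each through the ResourceLoader, and folding the lines into a version-aware CharArraySet. A hypothetical subclass sketch of the intended call pattern (the class name and the "words" value are illustrative; the real conversions are visible in CommonGramsFilterFactory and friends further down):

package org.apache.solr.analysis;

import java.io.IOException;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.util.plugin.ResourceLoaderAware;

public class SampleWordListFilterFactory extends BaseTokenFilterFactory
    implements ResourceLoaderAware {
  private CharArraySet words;

  public void inform(ResourceLoader loader) {
    // e.g. words="stopwords.txt,extra.txt" in the fieldType configuration
    String wordFiles = args.get("words");
    if (wordFiles != null) {
      try {
        words = getWordSet(loader, wordFiles, getBoolean("ignoreCase", false));
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  }

  public TokenStream create(TokenStream input) {
    return input; // a real factory would wrap input in its filter here
  }
}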

View File

@@ -18,15 +18,10 @@
 package org.apache.solr.analysis;

-import org.apache.lucene.analysis.br.*;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import java.io.IOException;
-import java.util.HashSet;
-import java.util.Hashtable;
-import java.util.Set;
-import java.util.Map;
+import org.apache.lucene.analysis.br.BrazilianStemFilter;

+/** Factory for {@link BrazilianStemFilter} */
 public class BrazilianStemFilterFactory extends BaseTokenFilterFactory {
   public BrazilianStemFilter create(TokenStream in) {
     return new BrazilianStemFilter(in);

View File

@@ -73,12 +73,12 @@ public abstract class BufferedTokenStream extends TokenFilter {
   private final LinkedList<Token> inQueue = new LinkedList<Token>();
   private final LinkedList<Token> outQueue = new LinkedList<Token>();

-  private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
-  private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
-  private final TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
-  private final FlagsAttribute flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
-  private final PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
-  private final PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

   public BufferedTokenStream(TokenStream input) {
     super(input);
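The change above is the first of many identical ones in this commit: addAttribute is generic in the Attributes-based API (its Class argument determines the return type), so the casts that littered Solr's filters are simply dropped. A minimal sketch of the resulting idiom (the filter itself is hypothetical):

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

final class PassThroughFilter extends TokenFilter {
  // No cast: the generic signature infers TermAttribute from the Class argument.
  private final TermAttribute termAtt = addAttribute(TermAttribute.class);

  PassThroughFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    return input.incrementToken(); // termAtt reflects the current token's term
  }
}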

View File

@@ -20,7 +20,7 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.bg.BulgarianStemFilter;

-/** Factory for BulgarianStemFilter */
+/** Factory for {@link BulgarianStemFilter} */
 public class BulgarianStemFilterFactory extends BaseTokenFilterFactory {
   public TokenStream create(TokenStream input) {
     return new BulgarianStemFilter(input);

View File

@@ -18,11 +18,11 @@
 package org.apache.solr.analysis;

-import org.apache.lucene.analysis.cjk.*;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.cjk.CJKTokenizer;
 import java.io.Reader;
-import java.util.Map;

+/** Factory for {@link CJKTokenizer} */
 public class CJKTokenizerFactory extends BaseTokenizerFactory {
   public CJKTokenizer create(Reader in) {
     return new CJKTokenizer(in);

View File

@@ -75,6 +75,7 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
   @Override
   public void init(Map<String, String> args) {
     super.init(args);
+    assureMatchVersion();
     String k = args.get(KEEP);
     if (k != null) {
@@ -84,7 +85,7 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
       if ("true".equalsIgnoreCase(ignoreStr)) {
         ignoreCase = true;
       }
-      keep = new CharArraySet(10, ignoreCase);
+      keep = new CharArraySet(luceneMatchVersion, 10, ignoreCase);
       while (st.hasMoreTokens()) {
         k = st.nextToken().trim();
         keep.add(k.toCharArray());
@@ -194,7 +195,7 @@ class CapitalizationFilter extends TokenFilter {
   public CapitalizationFilter(TokenStream in, final CapitalizationFilterFactory factory) {
     super(in);
     this.factory = factory;
-    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    this.termAtt = addAttribute(TermAttribute.class);
   }

   @Override

View File

@@ -18,10 +18,14 @@
 package org.apache.solr.analysis;

-import org.apache.lucene.analysis.cn.*;
-import java.util.Hashtable;
-import org.apache.lucene.analysis.*;
-import java.util.Map;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.cn.ChineseFilter;

+/**
+ * Factory for {@link ChineseFilter}
+ * @deprecated Use {@link StopFilterFactory} instead.
+ */
+@Deprecated
 public class ChineseFilterFactory extends BaseTokenFilterFactory {
   public ChineseFilter create(TokenStream in) {
     return new ChineseFilter(in);

View File

@@ -18,10 +18,15 @@
 package org.apache.solr.analysis;

-import org.apache.lucene.analysis.cn.*;
 import java.io.Reader;
-import org.apache.lucene.analysis.*;
-import java.util.Map;
+import org.apache.lucene.analysis.cn.ChineseTokenizer;

+/**
+ * Factory for {@link ChineseTokenizer}
+ * @deprecated Use {@link StandardTokenizerFactory} instead.
+ */
+@Deprecated
 public class ChineseTokenizerFactory extends BaseTokenizerFactory {
   public ChineseTokenizer create(Reader in) {
     return new ChineseTokenizer(in);

View File

@@ -20,6 +20,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.Version;

 /*
  * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors
@@ -51,15 +52,25 @@ public final class CommonGramsFilter extends TokenFilter {
   private final StringBuilder buffer = new StringBuilder();

-  private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
-  private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
-  private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class);
-  private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+  private final TermAttribute termAttribute = addAttribute(TermAttribute.class);
+  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
+  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);

   private int lastStartOffset;
   private boolean lastWasCommon;
   private State savedState;

+  /** @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set)} instead */
+  public CommonGramsFilter(TokenStream input, Set<?> commonWords) {
+    this(Version.LUCENE_29, input, commonWords);
+  }
+
+  /** @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set, boolean)} instead */
+  public CommonGramsFilter(TokenStream input, Set<?> commonWords, boolean ignoreCase) {
+    this(Version.LUCENE_29, input, commonWords, ignoreCase);
+  }
+
   /**
    * Construct a token stream filtering the given input using a Set of common
    * words to create bigrams. Outputs both unigrams with position increment and
@@ -69,8 +80,8 @@ public final class CommonGramsFilter extends TokenFilter {
    * @param input TokenStream input in filter chain
    * @param commonWords The set of common words.
    */
-  public CommonGramsFilter(TokenStream input, Set commonWords) {
-    this(input, commonWords, false);
+  public CommonGramsFilter(Version matchVersion, TokenStream input, Set<?> commonWords) {
+    this(matchVersion, input, commonWords, false);
   }

   /**
@@ -90,12 +101,12 @@ public final class CommonGramsFilter extends TokenFilter {
    * @param commonWords The set of common words.
    * @param ignoreCase -Ignore case when constructing bigrams for common words.
    */
-  public CommonGramsFilter(TokenStream input, Set commonWords, boolean ignoreCase) {
+  public CommonGramsFilter(Version matchVersion, TokenStream input, Set<?> commonWords, boolean ignoreCase) {
     super(input);
     if (commonWords instanceof CharArraySet) {
       this.commonWords = (CharArraySet) commonWords;
     } else {
-      this.commonWords = new CharArraySet(commonWords.size(), ignoreCase);
+      this.commonWords = new CharArraySet(matchVersion, commonWords.size(), ignoreCase);
       this.commonWords.addAll(commonWords);
     }
   }
@@ -106,7 +117,9 @@ public final class CommonGramsFilter extends TokenFilter {
    *
    * @param input Tokenstream in filter chain
    * @param commonWords words to be used in constructing bigrams
+   * @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set)} instead.
    */
+  @Deprecated
   public CommonGramsFilter(TokenStream input, String[] commonWords) {
     this(input, commonWords, false);
   }
@@ -118,7 +131,9 @@ public final class CommonGramsFilter extends TokenFilter {
    * @param input Tokenstream in filter chain
    * @param commonWords words to be used in constructing bigrams
    * @param ignoreCase -Ignore case when constructing bigrams for common words.
+   * @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set, boolean)} instead.
    */
+  @Deprecated
   public CommonGramsFilter(TokenStream input, String[] commonWords, boolean ignoreCase) {
     super(input);
     this.commonWords = makeCommonSet(commonWords, ignoreCase);
@@ -132,7 +147,9 @@ public final class CommonGramsFilter extends TokenFilter {
    * @param commonWords Array of common words which will be converted into the CharArraySet
    * @return CharArraySet of the given words, appropriate for passing into the CommonGramFilter constructor
    * @see #makeCommonSet(java.lang.String[], boolean) passing false to ignoreCase
+   * @deprecated create a CharArraySet with CharArraySet instead
    */
+  @Deprecated
   public static CharArraySet makeCommonSet(String[] commonWords) {
     return makeCommonSet(commonWords, false);
   }
@@ -145,7 +162,9 @@ public final class CommonGramsFilter extends TokenFilter {
    * @param commonWords Array of common words which will be converted into the CharArraySet
    * @param ignoreCase If true, all words are lower cased first.
    * @return a Set containing the words
+   * @deprecated create a CharArraySet with CharArraySet instead
    */
+  @Deprecated
   public static CharArraySet makeCommonSet(String[] commonWords, boolean ignoreCase) {
     CharArraySet commonSet = new CharArraySet(commonWords.length, ignoreCase);
     commonSet.addAll(Arrays.asList(commonWords));
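The pattern in this file repeats across the commit: constructors gain an explicit Version parameter, and the old signatures survive as deprecated shims pinned to LUCENE_29 so existing behavior is preserved. A hedged sketch of constructing the filter through the new entry point (the common-word values are illustrative, and the pre-Version WhitespaceTokenizer constructor is assumed):

package org.apache.solr.analysis;

import java.io.StringReader;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;

public class CommonGramsSketch {
  public static void main(String[] args) {
    // Version-aware set construction, mirroring the new constructor's contract.
    CharArraySet common = new CharArraySet(Version.LUCENE_31, 2, true);
    common.add("the");
    common.add("of");

    TokenStream ts = new WhitespaceTokenizer(new StringReader("the quick fox"));
    // New form: the match version is passed explicitly instead of being implied.
    ts = new CommonGramsFilter(Version.LUCENE_31, ts, common);
  }
}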

View File

@@ -17,14 +17,12 @@
 package org.apache.solr.analysis;

 import java.io.IOException;
-import java.util.List;
 import java.util.Set;

 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.StopAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.solr.common.ResourceLoader;
-import org.apache.solr.common.util.StrUtils;
 import org.apache.solr.util.plugin.ResourceLoaderAware;

 /**
@@ -43,16 +41,7 @@ public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements
     if (commonWordFiles != null) {
       try {
-        List<String> files = StrUtils.splitFileNames(commonWordFiles);
-        if (commonWords == null && files.size() > 0){
-          //default stopwords list has 35 or so words, but maybe don't make it that big to start
-          commonWords = new CharArraySet(files.size() * 10, ignoreCase);
-        }
-        for (String file : files) {
-          List<String> wlist = loader.getLines(file.trim());
-          //TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call
-          commonWords.addAll(CommonGramsFilter.makeCommonSet((String[])wlist.toArray(new String[0]), ignoreCase));
-        }
+        commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
       } catch (IOException e) {
         throw new RuntimeException(e);
       }
@@ -69,12 +58,12 @@ public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements
     return ignoreCase;
   }

-  public Set getCommonWords() {
+  public Set<?> getCommonWords() {
     return commonWords;
   }

   public CommonGramsFilter create(TokenStream input) {
-    CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords, ignoreCase);
+    CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords, ignoreCase);
     return commonGrams;
   }
 }

View File

@@ -47,8 +47,8 @@ import static org.apache.solr.analysis.CommonGramsFilter.GRAM_TYPE;
  */
 public final class CommonGramsQueryFilter extends TokenFilter {

-  private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class);
-  private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
+  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);

   private State previous;
   private String previousType;

View File

@@ -17,14 +17,13 @@
 package org.apache.solr.analysis;

 import java.io.IOException;
-import java.util.List;
+import java.util.Map;
 import java.util.Set;

 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.StopAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.solr.common.ResourceLoader;
-import org.apache.solr.common.util.StrUtils;
 import org.apache.solr.util.plugin.ResourceLoaderAware;

 /**
@@ -36,25 +35,19 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
 public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
   implements ResourceLoaderAware {

+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
+    assureMatchVersion();
+  }
+
   public void inform(ResourceLoader loader) {
     String commonWordFiles = args.get("words");
     ignoreCase = getBoolean("ignoreCase", false);

     if (commonWordFiles != null) {
       try {
-        List<String> files = StrUtils.splitFileNames(commonWordFiles);
-        if (commonWords == null && files.size() > 0) {
-          // default stopwords list has 35 or so words, but maybe don't make it
-          // that big to start
-          commonWords = new CharArraySet(files.size() * 10, ignoreCase);
-        }
-        for (String file : files) {
-          List<String> wlist = loader.getLines(file.trim());
-          // TODO: once StopFilter.makeStopSet(List) method is available, switch
-          // to using that so we can avoid a toArray() call
-          commonWords.addAll(CommonGramsFilter.makeCommonSet((String[]) wlist
-              .toArray(new String[0]), ignoreCase));
-        }
+        commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
       } catch (IOException e) {
         throw new RuntimeException(e);
       }
@@ -73,7 +66,7 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
     return ignoreCase;
   }

-  public Set getCommonWords() {
+  public Set<?> getCommonWords() {
     return commonWords;
   }
@@ -81,7 +74,7 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
    * Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter
    */
   public CommonGramsQueryFilter create(TokenStream input) {
-    CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords,
+    CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords,
         ignoreCase);
     CommonGramsQueryFilter commonGramsQuery = new CommonGramsQueryFilter(
         commonGrams);

View File

@@ -20,7 +20,7 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.cz.CzechStemFilter;

-/** Factory for CzechStemFilter */
+/** Factory for {@link CzechStemFilter} */
 public class CzechStemFilterFactory extends BaseTokenFilterFactory {
   public TokenStream create(TokenStream input) {
     return new CzechStemFilter(input);

View File

@@ -31,7 +31,7 @@ import java.util.Map;

 /**
  *
- *
+ * Factory for {@link DelimitedPayloadTokenFilter}
 **/
 public class DelimitedPayloadTokenFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
   public static final String ENCODER_ATTR = "encoder";

View File

@@ -18,20 +18,18 @@
 package org.apache.solr.analysis;

-import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.compound.*;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.common.SolrException;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;

-import java.util.List;
-import java.util.Set;
 import java.util.Map;
 import java.io.IOException;

+/** Factory for {@link DictionaryCompoundWordTokenFilter} */
 public class DictionaryCompoundWordTokenFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
-  private Set dictionary;
+  private CharArraySet dictionary;
   private String dictFile;
   private int minWordSize;
   private int minSubwordSize;
@@ -39,6 +37,7 @@ public class DictionaryCompoundWordTokenFilterFactory extends BaseTokenFilterFac
   private boolean onlyLongestMatch;
   public void init(Map<String, String> args) {
     super.init(args);
+    assureMatchVersion();
     dictFile = args.get("dictionary");
     if (null == dictFile) {
       throw new SolrException( SolrException.ErrorCode.SERVER_ERROR,
@@ -52,14 +51,13 @@ public class DictionaryCompoundWordTokenFilterFactory extends BaseTokenFilterFac
   }
   public void inform(ResourceLoader loader) {
     try {
-      List<String> wlist = loader.getLines(dictFile);
-      dictionary = StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), false);
+      dictionary = super.getWordSet(loader, dictFile, false);
     } catch (IOException e) {
       throw new RuntimeException(e);
     }
   }
   public DictionaryCompoundWordTokenFilter create(TokenStream input) {
-    return new DictionaryCompoundWordTokenFilter(input,dictionary,minWordSize,minSubwordSize,maxSubwordSize,onlyLongestMatch);
+    return new DictionaryCompoundWordTokenFilter(luceneMatchVersion,input,dictionary,minWordSize,minSubwordSize,maxSubwordSize,onlyLongestMatch);
   }
 }

View File

@@ -20,11 +20,9 @@ import java.io.IOException;
 import java.util.LinkedList;

 import org.apache.commons.codec.language.DoubleMetaphone;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

 public class DoubleMetaphoneFilter extends TokenFilter {
@@ -41,8 +39,8 @@ public class DoubleMetaphoneFilter extends TokenFilter {
     super(input);
     this.encoder.setMaxCodeLen(maxCodeLength);
     this.inject = inject;
-    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
-    this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+    this.termAtt = addAttribute(TermAttribute.class);
+    this.posAtt = addAttribute(PositionIncrementAttribute.class);
   }

   @Override

View File

@@ -18,19 +18,19 @@
 package org.apache.solr.analysis;

-import org.apache.lucene.analysis.nl.*;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.Map;
-import java.util.Map;

+/**
+ * @deprecated Use {@link SnowballPorterFilterFactory} with "Dutch" instead,
+ * which has the same functionality.
+ */
+@Deprecated
 public class DutchStemFilterFactory extends BaseTokenFilterFactory {
-  public DutchStemFilter create(TokenStream _in) {
-    return new DutchStemFilter(_in);
+  public TokenFilter create(TokenStream _in) {
+    return new SnowballFilter(_in, new org.tartarus.snowball.ext.DutchStemmer());
   }
 }

View File

@@ -21,32 +21,22 @@ package org.apache.solr.analysis;

 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
-import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.fr.*;

 import java.io.IOException;
-import java.util.Set;
-import java.util.HashSet;
-import java.util.Arrays;
-import java.util.Iterator;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.TokenFilter;
-import java.util.Map;
-import java.util.List;
-import java.util.Set;
-import java.io.IOException;

+/** Factory for {@link ElisionFilter} */
 public class ElisionFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {

-  private Set articles;
+  private CharArraySet articles;

   public void inform(ResourceLoader loader) {
     String articlesFile = args.get("articles");
     if (articlesFile != null) {
       try {
-        List<String> wlist = loader.getLines(articlesFile);
-        articles = StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), false);
+        articles = getWordSet(loader, articlesFile, false);
       } catch (IOException e) {
         throw new RuntimeException(e);
       }

View File

@@ -18,17 +18,14 @@
 package org.apache.solr.analysis;

 import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.solr.common.ResourceLoader;
-import org.apache.solr.common.util.StrUtils;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
-import org.tartarus.snowball.SnowballProgram;

 import java.io.IOException;
-import java.io.File;
-import java.util.List;

 /**
  * @version $Id$
@@ -42,21 +39,7 @@ public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implement
     String wordFiles = args.get(PROTECTED_TOKENS);
     if (wordFiles != null) {
       try {
-        File protectedWordFiles = new File(wordFiles);
-        if (protectedWordFiles.exists()) {
-          List<String> wlist = loader.getLines(wordFiles);
-          //This cast is safe in Lucene
-          protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
-        } else {
-          List<String> files = StrUtils.splitFileNames(wordFiles);
-          for (String file : files) {
-            List<String> wlist = loader.getLines(file.trim());
-            if (protectedWords == null)
-              protectedWords = new CharArraySet(wlist, false);
-            else
-              protectedWords.addAll(wlist);
-          }
-        }
+        protectedWords = getWordSet(loader, wordFiles, false);
       } catch (IOException e) {
         throw new RuntimeException(e);
       }
@@ -65,20 +48,10 @@ public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implement
   private CharArraySet protectedWords = null;

-  public EnglishPorterFilter create(TokenStream input) {
-    return new EnglishPorterFilter(input, protectedWords);
+  public TokenFilter create(TokenStream input) {
+    if (protectedWords != null)
+      input = new KeywordMarkerTokenFilter(input, protectedWords);
+    return new SnowballFilter(input, new org.tartarus.snowball.ext.EnglishStemmer());
   }
 }
-
-/**
- * English Porter2 filter that doesn't use reflection to
- * adapt lucene to the snowball stemmer code.
- */
-@Deprecated
-class EnglishPorterFilter extends SnowballPorterFilter {
-  public EnglishPorterFilter(TokenStream source, CharArraySet protWords) {
-    super(source, new org.tartarus.snowball.ext.EnglishStemmer(), protWords);
-  }
-}

View File

@@ -18,18 +18,19 @@
 package org.apache.solr.analysis;

-import org.apache.lucene.analysis.fr.*;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import java.io.IOException;
-import java.util.Hashtable;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.Map;

+/**
+ * @deprecated Use {@link SnowballPorterFilterFactory} with "French" instead,
+ * which has the same functionality.
+ */
+@Deprecated
 public class FrenchStemFilterFactory extends BaseTokenFilterFactory {
-  public FrenchStemFilter create(TokenStream in) {
-    return new FrenchStemFilter(in);
+  public TokenFilter create(TokenStream in) {
+    return new SnowballFilter(in, new org.tartarus.snowball.ext.FrenchStemmer());
   }
 }

View File

@@ -18,13 +18,11 @@
 package org.apache.solr.analysis;

-import org.apache.lucene.analysis.de.*;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.de.GermanStemFilter;
 import org.apache.lucene.analysis.TokenStream;
-import java.io.IOException;
-import java.util.Set;
-import java.util.Map;

+/** Factory for {@link GermanStemFilter} */
 public class GermanStemFilterFactory extends BaseTokenFilterFactory {
   public GermanStemFilter create(TokenStream in) {
     return new GermanStemFilter(in);

View File

@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.el.GreekLowerCaseFilter;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;

+/** Factory for {@link GreekLowerCaseFilter} */
 public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory
 {

View File

@@ -20,7 +20,7 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.hi.HindiNormalizationFilter;

-/** Factory for HindiNormalizationFilter */
+/** Factory for {@link HindiNormalizationFilter} */
 public class HindiNormalizationFilterFactory extends BaseTokenFilterFactory {
   public TokenStream create(TokenStream input) {
     return new HindiNormalizationFilter(input);

View File

@@ -20,7 +20,7 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.hi.HindiStemFilter;

-/** Factory for HindiStemFilter */
+/** Factory for {@link HindiStemFilter} */
 public class HindiStemFilterFactory extends BaseTokenFilterFactory {
   public TokenStream create(TokenStream input) {
     return new HindiStemFilter(input);

View File

@@ -54,8 +54,8 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
  */
 public final class HyphenatedWordsFilter extends TokenFilter {

-  private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
-  private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+  private final TermAttribute termAttribute = addAttribute(TermAttribute.class);
+  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);

   private final StringBuilder hyphenated = new StringBuilder();
   private State savedState;

View File

@@ -21,7 +21,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.solr.analysis.BaseTokenFilterFactory;

 /**
- * Factory for HyphenatedWordsFilter
+ * Factory for {@link HyphenatedWordsFilter}
  */
 public class HyphenatedWordsFilterFactory extends BaseTokenFilterFactory {
   public HyphenatedWordsFilter create(TokenStream input) {

View File

@@ -21,8 +21,10 @@ import org.apache.lucene.analysis.ISOLatin1AccentFilter;
 import org.apache.lucene.analysis.TokenStream;

 /** Factory for ISOLatin1AccentFilter
+ * @deprecated Use {@link ASCIIFoldingFilterFactory} instead.
  * $Id$
  */
+@Deprecated
 public class ISOLatin1AccentFilterFactory extends BaseTokenFilterFactory {
   public ISOLatin1AccentFilter create(TokenStream input) {
     return new ISOLatin1AccentFilter(input);

View File

@@ -20,7 +20,7 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.in.IndicNormalizationFilter;

-/** Factory for IndicNormalizationFilter */
+/** Factory for {@link IndicNormalizationFilter} */
 public class IndicNormalizationFilterFactory extends BaseTokenFilterFactory {
   public TokenStream create(TokenStream input) {
     return new IndicNormalizationFilter(input);

View File

@@ -22,7 +22,7 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.in.IndicTokenizer;

-/** Factory for IndicTokenizer */
+/** Factory for {@link IndicTokenizer} */
 public class IndicTokenizerFactory extends BaseTokenizerFactory {
   public Tokenizer create(Reader input) {
     assureMatchVersion();

View File

@@ -19,10 +19,8 @@ package org.apache.solr.analysis;

 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.tartarus.snowball.SnowballProgram;

 import java.io.IOException;
 import java.util.Set;
@@ -38,6 +36,8 @@ public final class KeepWordFilter extends TokenFilter {
   private final CharArraySet words;
   private final TermAttribute termAtt;

+  /** @deprecated Use {@link #KeepWordFilter(TokenStream, Set, boolean)} instead */
+  @Deprecated
   public KeepWordFilter(TokenStream in, Set<String> words, boolean ignoreCase ) {
     this(in, new CharArraySet(words, ignoreCase));
   }
@@ -47,7 +47,7 @@ public final class KeepWordFilter extends TokenFilter {
   public KeepWordFilter(TokenStream in, CharArraySet words) {
     super(in);
     this.words = words;
-    this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
+    this.termAtt = addAttribute(TermAttribute.class);
   }

   @Override

View File

@@ -18,17 +18,11 @@
 package org.apache.solr.analysis;

 import org.apache.solr.common.ResourceLoader;
-import org.apache.solr.common.util.StrUtils;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
-import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.CharArraySet;

-import java.util.HashSet;
-import java.util.List;
 import java.util.Set;
-import java.io.File;
-import java.io.File;
 import java.io.IOException;

 /**
@@ -40,23 +34,13 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements Res
   private CharArraySet words;
   private boolean ignoreCase;

-  @SuppressWarnings("unchecked")
   public void inform(ResourceLoader loader) {
     String wordFiles = args.get("words");
     ignoreCase = getBoolean("ignoreCase", false);
     if (wordFiles != null) {
       try {
-        List<String> files = StrUtils.splitFileNames(wordFiles);
-        if (words == null && files.size() > 0){
-          words = new CharArraySet(files.size() * 10, ignoreCase);
-        }
-        for (String file : files) {
-          List<String> wlist = loader.getLines(file.trim());
-          //TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call
-          words.addAll(StopFilter.makeStopSet((String[]) wlist.toArray(new String[0]), ignoreCase));
-        }
-      }
-      catch (IOException e) {
+        words = getWordSet(loader, wordFiles, ignoreCase);
+      } catch (IOException e) {
         throw new RuntimeException(e);
       }
     }
@@ -67,14 +51,14 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements Res
    * NOTE: if ignoreCase==true, the words are expected to be lowercase
    */
   public void setWords(Set<String> words) {
-    this.words = new CharArraySet(words, ignoreCase);
+    this.words = new CharArraySet(luceneMatchVersion, words, ignoreCase);
   }

   public void setIgnoreCase(boolean ignoreCase) {
-    this.ignoreCase = ignoreCase;
-    if (words != null) {
-      words = new CharArraySet(words, ignoreCase);
+    if (words != null && this.ignoreCase != ignoreCase) {
+      words = new CharArraySet(luceneMatchVersion, words, ignoreCase);
     }
+    this.ignoreCase = ignoreCase;
   }

   public KeepWordFilter create(TokenStream input) {

View File

@@ -0,0 +1,55 @@
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Factory for {@link KeywordMarkerTokenFilter}
+ */
+public class KeywordMarkerFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+  public static final String PROTECTED_TOKENS = "protected";
+  private CharArraySet protectedWords;
+  private boolean ignoreCase;
+
+  public void inform(ResourceLoader loader) {
+    String wordFiles = args.get(PROTECTED_TOKENS);
+    ignoreCase = getBoolean("ignoreCase", false);
+    if (wordFiles != null) {
+      try {
+        protectedWords = getWordSet(loader, wordFiles, ignoreCase);
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    }
+  }
+
+  public boolean isIgnoreCase() {
+    return ignoreCase;
+  }
+
+  public TokenStream create(TokenStream input) {
+    return protectedWords == null ? input : new KeywordMarkerTokenFilter(input, protectedWords);
+  }
+}
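A hedged sketch of driving the new factory directly, using the init parameters defined above ("protected", "ignoreCase"); the word-file name is illustrative, and inform(...) would normally receive Solr's resource loader for the config directory before create() is called:

package org.apache.solr.analysis;

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

public class KeywordMarkerFactorySketch {
  public static void main(String[] args) throws Exception {
    KeywordMarkerFilterFactory factory = new KeywordMarkerFilterFactory();
    Map<String,String> initArgs = new HashMap<String,String>();
    initArgs.put("protected", "protwords.txt"); // illustrative file name
    initArgs.put("ignoreCase", "true");
    factory.init(initArgs);
    // factory.inform(loader) must run before create() so the word set is loaded.

    TokenStream ts = factory.create(
        new WhitespaceTokenizer(new StringReader("some text")));
    // With no word set loaded, create() returns the input stream unchanged.
  }
}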

View File

@@ -17,7 +17,6 @@
 package org.apache.solr.analysis;

-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.KeywordTokenizer;

 import java.io.Reader;

View File

@@ -17,17 +17,23 @@
 package org.apache.solr.analysis;

-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.LetterTokenizer;

 import java.io.Reader;
+import java.util.Map;

 /**
  * @version $Id$
  */
 public class LetterTokenizerFactory extends BaseTokenizerFactory {
-  public LetterTokenizer create(Reader input) {
+
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
     assureMatchVersion();
+  }
+
+  public LetterTokenizer create(Reader input) {
     return new LetterTokenizer(luceneMatchVersion, input);
   }
 }

View File

@@ -17,6 +17,8 @@
 package org.apache.solr.analysis;

+import java.util.Map;
+
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.LowerCaseFilter;

@@ -24,8 +26,13 @@ import org.apache.lucene.analysis.LowerCaseFilter;
  * @version $Id$
  */
 public class LowerCaseFilterFactory extends BaseTokenFilterFactory {
-  public LowerCaseFilter create(TokenStream input) {
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
     assureMatchVersion();
+  }
+
+  public LowerCaseFilter create(TokenStream input) {
     return new LowerCaseFilter(luceneMatchVersion,input);
   }
 }

View File

@@ -17,17 +17,22 @@
 package org.apache.solr.analysis;

-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.LowerCaseTokenizer;

 import java.io.Reader;
+import java.util.Map;

 /**
  * @version $Id$
  */
 public class LowerCaseTokenizerFactory extends BaseTokenizerFactory {
-  public LowerCaseTokenizer create(Reader input) {
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
     assureMatchVersion();
+  }
+
+  public LowerCaseTokenizer create(Reader input) {
     return new LowerCaseTokenizer(luceneMatchVersion,input);
   }
 }

View File

@@ -18,13 +18,12 @@
 package org.apache.solr.analysis;

-import org.apache.lucene.analysis.payloads.*;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.payloads.NumericPayloadTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.index.Payload;
-import java.io.IOException;
 import java.util.Map;

+/** Factory for {@link NumericPayloadTokenFilter} */
 public class NumericPayloadTokenFilterFactory extends BaseTokenFilterFactory {
   private float payload;
   private String typeMatch;

View File

@@ -19,13 +19,10 @@ package org.apache.solr.analysis;

 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 import java.util.regex.Pattern;
 import java.util.regex.Matcher;
-import java.util.Set;
 import java.io.IOException;
 import java.nio.CharBuffer;
@@ -66,7 +63,7 @@ public final class PatternReplaceFilter extends TokenFilter {
     this.p=p;
     this.replacement = (null == replacement) ? "" : replacement;
     this.all=all;
-    this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
+    this.termAtt = addAttribute(TermAttribute.class);
   }

   @Override

View File

@@ -56,8 +56,8 @@ import org.apache.commons.io.IOUtils;
  */
 public final class PatternTokenizer extends Tokenizer {

-  private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
-  private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

   private String str;
   private int index;

View File

@@ -18,12 +18,11 @@
 package org.apache.solr.analysis;

-import org.apache.lucene.analysis.fa.*;
-import java.io.IOException;
-import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import java.util.Map;

+/** Factory for {@link PersianNormalizationFilter} */
 public class PersianNormalizationFilterFactory extends BaseTokenFilterFactory {
   public PersianNormalizationFilter create(TokenStream input) {
     return new PersianNormalizationFilter(input);

View File

@@ -20,7 +20,6 @@ package org.apache.solr.analysis;
 import org.apache.commons.codec.Encoder;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -47,8 +46,8 @@ public class PhoneticFilter extends TokenFilter
     this.encoder = encoder;
     this.name = name;
     this.inject = inject;
-    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
-    this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+    this.termAtt = addAttribute(TermAttribute.class);
+    this.posAtt = addAttribute(PositionIncrementAttribute.class);
   }

   @Override
@Override @Override

View File

@ -17,11 +17,12 @@
package org.apache.solr.analysis; package org.apache.solr.analysis;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.solr.util.CharArrayMap; import org.apache.lucene.util.Version;
import java.io.IOException; import java.io.IOException;
@ -30,12 +31,11 @@ import java.io.IOException;
*/ */
public final class RemoveDuplicatesTokenFilter extends TokenFilter { public final class RemoveDuplicatesTokenFilter extends TokenFilter {
private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class); private final TermAttribute termAttribute = addAttribute(TermAttribute.class);
private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
// keep a seen 'set' after each term with posInc > 0 // use a fixed version, as we don't care about case sensitivity.
// for now use CharArrayMap vs CharArraySet, as it has clear() private final CharArraySet previous = new CharArraySet(Version.LUCENE_31, 8, false);
private final CharArrayMap<Boolean> previous = new CharArrayMap<Boolean>(8, false);
/** /**
* Creates a new RemoveDuplicatesTokenFilter * Creates a new RemoveDuplicatesTokenFilter
@ -60,12 +60,12 @@ public final class RemoveDuplicatesTokenFilter extends TokenFilter {
previous.clear(); previous.clear();
} }
boolean duplicate = (posIncrement == 0 && previous.get(term, 0, length) != null); boolean duplicate = (posIncrement == 0 && previous.contains(term, 0, length));
// clone the term, and add to the set of seen terms. // clone the term, and add to the set of seen terms.
char saved[] = new char[length]; char saved[] = new char[length];
System.arraycopy(term, 0, saved, 0, length); System.arraycopy(term, 0, saved, 0, length);
previous.put(saved, Boolean.TRUE); previous.add(saved);
if (!duplicate) { if (!duplicate) {
return true; return true;
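To illustrate the swap above: Lucene's CharArraySet can test a slice of the shared term buffer directly, which is all the duplicate check needs, and clear() resets it whenever the position advances. A minimal sketch, assuming the Lucene 3.1 CharArraySet the new code uses:

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;

public class DuplicateCheckSketch {
  public static void main(String[] args) {
    // fixed version, case-sensitive: matches the filter's field above
    CharArraySet previous = new CharArraySet(Version.LUCENE_31, 8, false);
    char[] term = "hybrid".toCharArray();
    System.out.println(previous.contains(term, 0, term.length)); // false: first sighting
    previous.add(term);
    System.out.println(previous.contains(term, 0, term.length)); // true: a posInc==0 repeat would be dropped
  }
}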


@ -45,8 +45,8 @@ public class ReversedWildcardFilter extends TokenFilter {
protected ReversedWildcardFilter(TokenStream input, boolean withOriginal, char markerChar) { protected ReversedWildcardFilter(TokenStream input, boolean withOriginal, char markerChar) {
super(input); super(input);
this.termAtt = (TermAttribute)addAttribute(TermAttribute.class); this.termAtt = addAttribute(TermAttribute.class);
this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); this.posAtt = addAttribute(PositionIncrementAttribute.class);
this.withOriginal = withOriginal; this.withOriginal = withOriginal;
this.markerChar = markerChar; this.markerChar = markerChar;
} }


@ -1,61 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
//package org.apache.solr.analysis;
//import org.apache.lucene.analysis.ru.*;
//import java.util.Map;
//import java.util.HashMap;
//import org.apache.solr.core.SolrConfig;
//import org.apache.solr.common.SolrException;
//import org.apache.solr.common.SolrException.ErrorCode;
//import org.slf4j.Logger;
//import org.slf4j.LoggerFactory;
//
//@Deprecated
//public class RussianCommon {
//
// private static Logger logger = LoggerFactory.getLogger(RussianCommon.class);
//
// private static Map<String,char[]> CHARSETS = new HashMap<String,char[]>();
// static {
// CHARSETS.put("UnicodeRussian",RussianCharsets.UnicodeRussian);
// CHARSETS.put("KOI8",RussianCharsets.KOI8);
// CHARSETS.put("CP1251",RussianCharsets.CP1251);
// }
//
// public static char[] getCharset(String name) {
// if (null == name)
// return RussianCharsets.UnicodeRussian;
//
// char[] charset = CHARSETS.get(name);
//
// if (charset.equals(RussianCharsets.UnicodeRussian))
// logger.warn("Specifying UnicodeRussian is no longer required (default). "
// + "Use of the charset parameter will cause an error in Solr 1.5");
// else
// logger.warn("Support for this custom encoding is deprecated. "
// + "Use of the charset parameter will cause an error in Solr 1.5");
//
// if (null == charset) {
// throw new SolrException(ErrorCode.SERVER_ERROR,
// "Don't understand charset: " + name);
// }
// return charset;
// }
//}


@ -24,6 +24,10 @@ import org.apache.lucene.analysis.ru.RussianLetterTokenizer;
import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.common.SolrException.ErrorCode;
/** @deprecated Use {@link StandardTokenizerFactory} instead.
* This tokenizer has no Russian-specific functionality.
*/
@Deprecated
public class RussianLetterTokenizerFactory extends BaseTokenizerFactory { public class RussianLetterTokenizerFactory extends BaseTokenizerFactory {
@Override @Override


@ -19,11 +19,17 @@ package org.apache.solr.analysis;
import java.util.Map; import java.util.Map;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ru.RussianLowerCaseFilter; import org.apache.lucene.util.Version;
import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.common.SolrException.ErrorCode;
/** @deprecated Use {@link LowerCaseFilterFactory} instead which has the
* same functionality.
*/
@Deprecated
public class RussianLowerCaseFilterFactory extends BaseTokenFilterFactory { public class RussianLowerCaseFilterFactory extends BaseTokenFilterFactory {
@Override @Override
@ -35,8 +41,9 @@ public class RussianLowerCaseFilterFactory extends BaseTokenFilterFactory {
+ "Please process your documents as Unicode instead."); + "Please process your documents as Unicode instead.");
} }
public RussianLowerCaseFilter create(TokenStream in) { public TokenFilter create(TokenStream in) {
return new RussianLowerCaseFilter(in); // hardcode the version to give exactly the old behavior
return new LowerCaseFilter(Version.LUCENE_29, in);
} }
} }
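A minimal sketch of the replacement wiring, with the version pinned at 2.9 as above so the output is exactly what RussianLowerCaseFilter produced (sample text is made up; presumably passing LUCENE_31 instead would also enable the newer supplementary-character handling):

import java.io.StringReader;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class PinnedLowerCaseSketch {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new LowerCaseFilter(Version.LUCENE_29,
        new WhitespaceTokenizer(Version.LUCENE_31, new StringReader("МОСКВА Lucene")));
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term()); // москва, lucene
    }
    ts.close();
  }
}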


@ -19,16 +19,19 @@
package org.apache.solr.analysis; package org.apache.solr.analysis;
import java.util.Map; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ru.RussianStemFilter; import org.apache.lucene.analysis.snowball.SnowballFilter;
/**
* @deprecated Use {@link SnowballPorterFilterFactory} with "Russian" instead,
* which has the same functionality.
*/
@Deprecated
public class RussianStemFilterFactory extends BaseTokenFilterFactory { public class RussianStemFilterFactory extends BaseTokenFilterFactory {
public TokenFilter create(TokenStream in) {
public RussianStemFilter create(TokenStream in) { return new SnowballFilter(in, new org.tartarus.snowball.ext.RussianStemmer());
return new RussianStemFilter(in);
} }
} }
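A minimal sketch of the chain the deprecated factory now builds, which is also what SnowballPorterFilterFactory produces when configured with language="Russian" (sample words are made up):

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class RussianSnowballSketch {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new SnowballFilter(
        new WhitespaceTokenizer(Version.LUCENE_31, new StringReader("книги книгами")),
        new org.tartarus.snowball.ext.RussianStemmer());
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term()); // both forms should reduce to the same root
    }
    ts.close();
  }
}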


@ -18,14 +18,12 @@
package org.apache.solr.analysis; package org.apache.solr.analysis;
import org.apache.lucene.analysis.shingle.*;
import java.io.IOException; import org.apache.lucene.analysis.shingle.ShingleFilter;
import java.util.LinkedList;
import java.util.Iterator;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import java.util.Map; import java.util.Map;
/** Factory for {@link ShingleFilter} */
public class ShingleFilterFactory extends BaseTokenFilterFactory { public class ShingleFilterFactory extends BaseTokenFilterFactory {
private int maxShingleSize; private int maxShingleSize;
private boolean outputUnigrams; private boolean outputUnigrams;


@ -17,26 +17,21 @@
package org.apache.solr.analysis; package org.apache.solr.analysis;
import java.util.Map; import java.util.Map;
import java.util.List;
import java.io.File;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.solr.common.ResourceLoader; import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware; import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.tartarus.snowball.SnowballProgram; import org.tartarus.snowball.SnowballProgram;
/** /**
* Factory for SnowballFilters, with configurable language * Factory for {@link SnowballFilter}, with configurable language
* * <p>
* Browsing the code, SnowballFilter uses reflection to adapt to Lucene... don't * Note: Use of the "Lovins" stemmer is not recommended, as it is implemented with reflection.
* use this if you are concerned about speed. Use EnglishPorterFilterFactory.
* *
* @version $Id$ * @version $Id$
*/ */
@ -44,28 +39,14 @@ public class SnowballPorterFilterFactory extends BaseTokenFilterFactory implemen
public static final String PROTECTED_TOKENS = "protected"; public static final String PROTECTED_TOKENS = "protected";
private String language = "English"; private String language = "English";
private Class stemClass; private Class<?> stemClass;
public void inform(ResourceLoader loader) { public void inform(ResourceLoader loader) {
String wordFiles = args.get(PROTECTED_TOKENS); String wordFiles = args.get(PROTECTED_TOKENS);
if (wordFiles != null) { if (wordFiles != null) {
try { try {
File protectedWordFiles = new File(wordFiles); protectedWords = getWordSet(loader, wordFiles, false);
if (protectedWordFiles.exists()) {
List<String> wlist = loader.getLines(wordFiles);
//This cast is safe in Lucene
protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
} else {
List<String> files = StrUtils.splitFileNames(wordFiles);
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
if (protectedWords == null)
protectedWords = new CharArraySet(wlist, false);
else
protectedWords.addAll(wlist);
}
}
} catch (IOException e) { } catch (IOException e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
@ -87,50 +68,17 @@ public class SnowballPorterFilterFactory extends BaseTokenFilterFactory implemen
} }
} }
public SnowballPorterFilter create(TokenStream input) { public TokenFilter create(TokenStream input) {
SnowballProgram program; SnowballProgram program;
try { try {
program = (SnowballProgram)stemClass.newInstance(); program = (SnowballProgram)stemClass.newInstance();
} catch (Exception e) { } catch (Exception e) {
throw new RuntimeException("Error instantiating stemmer for language " + language + "from class " +stemClass, e); throw new RuntimeException("Error instantiating stemmer for language " + language + "from class " +stemClass, e);
} }
return new SnowballPorterFilter(input, program, protectedWords);
if (protectedWords != null)
input = new KeywordMarkerTokenFilter(input, protectedWords);
return new SnowballFilter(input, program);
} }
} }
class SnowballPorterFilter extends TokenFilter {
private final CharArraySet protWords;
private final SnowballProgram stemmer;
private final TermAttribute termAtt;
public SnowballPorterFilter(TokenStream source, SnowballProgram stemmer, CharArraySet protWords) {
super(source);
this.protWords = protWords;
this.stemmer = stemmer;
this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
}
@Override
public boolean incrementToken() throws IOException {
if (!input.incrementToken()) return false;
char[] termBuffer = termAtt.termBuffer();
int len = termAtt.termLength();
// if protected, don't stem. use this to avoid stemming collisions.
if (protWords != null && protWords.contains(termBuffer, 0, len)) {
return true;
}
stemmer.setCurrent(termBuffer, len);
stemmer.stem();
final char finalTerm[] = stemmer.getCurrentBuffer();
final int newLength = stemmer.getCurrentBufferLength();
if (finalTerm != termBuffer)
termAtt.setTermBuffer(finalTerm, 0, newLength);
else
termAtt.setTermLength(newLength);
return true;
}
}
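The rewritten create() above composes two stock Lucene filters in place of the private SnowballPorterFilter: KeywordMarkerTokenFilter flags protected tokens, and SnowballFilter leaves flagged tokens unstemmed. A minimal sketch of the same chain (sample words are made up):

import java.io.StringReader;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class ProtectedStemmingSketch {
  public static void main(String[] args) throws Exception {
    CharArraySet protectedWords = new CharArraySet(Version.LUCENE_31, 1, false);
    protectedWords.add("running"); // keep this token as-is
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_31,
        new StringReader("running jumping"));
    ts = new KeywordMarkerTokenFilter(ts, protectedWords); // marks "running" as a keyword
    ts = new SnowballFilter(ts, new org.tartarus.snowball.ext.EnglishStemmer());
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term()); // running, jump
    }
    ts.close();
  }
}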


@ -17,18 +17,23 @@
package org.apache.solr.analysis; package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.Reader; import java.io.Reader;
import java.util.Map;
/** /**
* @version $Id$ * @version $Id$
*/ */
public class StandardTokenizerFactory extends BaseTokenizerFactory { public class StandardTokenizerFactory extends BaseTokenizerFactory {
public StandardTokenizer create(Reader input) { @Override
public void init(Map<String,String> args) {
super.init(args);
assureMatchVersion(); assureMatchVersion();
}
public StandardTokenizer create(Reader input) {
return new StandardTokenizer(luceneMatchVersion, input); return new StandardTokenizer(luceneMatchVersion, input);
} }
} }
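Moving assureMatchVersion() into init(...) turns a missing luceneMatchVersion into a failure at schema load rather than at the first create() call. A hedged sketch of exercising the factory directly, assuming init(...) parses the version parameter the same way the tests further below do:

import java.io.StringReader;
import java.util.Collections;
import java.util.Map;
import org.apache.solr.analysis.StandardTokenizerFactory;

public class FactoryInitSketch {
  public static void main(String[] args) {
    StandardTokenizerFactory factory = new StandardTokenizerFactory();
    Map<String,String> params =
        Collections.singletonMap("luceneMatchVersion", "LUCENE_CURRENT");
    factory.init(params); // would throw here, not later, if the version were absent
    System.out.println(factory.create(new StringReader("init-time validation")));
  }
}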


@ -0,0 +1,68 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.List;
import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
/**
* Factory for {@link StemmerOverrideFilter}
*/
public class StemmerOverrideFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
private CharArrayMap<String> dictionary = null;
private boolean ignoreCase;
public void inform(ResourceLoader loader) {
String dictionaryFiles = args.get("dictionary");
ignoreCase = getBoolean("ignoreCase", false);
if (dictionaryFiles != null) {
assureMatchVersion();
List<String> files = StrUtils.splitFileNames(dictionaryFiles);
try {
if (files.size() > 0) {
dictionary = new CharArrayMap<String>(luceneMatchVersion,
files.size() * 10, ignoreCase);
for (String file : files) {
List<String> list = loader.getLines(file.trim());
for (String line : list) {
String[] mapping = line.split("\t", 2);
dictionary.put(mapping[0], mapping[1]);
}
}
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
public boolean isIgnoreCase() {
return ignoreCase;
}
public TokenStream create(TokenStream input) {
return dictionary == null ? input : new StemmerOverrideFilter(luceneMatchVersion, input, dictionary);
}
}
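The dictionary parsed in inform() above is plain text, one tab-separated token/replacement pair per line (for example a line reading "monkeys<TAB>monkey"). A minimal sketch of the same parsing against an in-memory dictionary (the entries are made up):

import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.util.Version;

public class OverrideDictionarySketch {
  public static void main(String[] args) {
    CharArrayMap<String> dictionary =
        new CharArrayMap<String>(Version.LUCENE_31, 16, false);
    for (String line : new String[] {"monkeys\tmonkey", "oxen\tox"}) {
      String[] mapping = line.split("\t", 2); // split on the first tab only
      dictionary.put(mapping[0], mapping[1]);
    }
    System.out.println(dictionary.get("oxen".toCharArray(), 0, 4)); // ox
  }
}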


@ -18,18 +18,14 @@
package org.apache.solr.analysis; package org.apache.solr.analysis;
import org.apache.solr.common.ResourceLoader; import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware; import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopAnalyzer; import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.CharArraySet;
import java.util.HashSet; import java.util.Map;
import java.util.List;
import java.io.File;
import java.util.Set; import java.util.Set;
import java.io.File;
import java.io.IOException; import java.io.IOException;
/** /**
@ -37,6 +33,12 @@ import java.io.IOException;
*/ */
public class StopFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { public class StopFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
@Override
public void init(Map<String,String> args) {
super.init(args);
assureMatchVersion();
}
public void inform(ResourceLoader loader) { public void inform(ResourceLoader loader) {
String stopWordFiles = args.get("words"); String stopWordFiles = args.get("words");
ignoreCase = getBoolean("ignoreCase",false); ignoreCase = getBoolean("ignoreCase",false);
@ -44,20 +46,12 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc
if (stopWordFiles != null) { if (stopWordFiles != null) {
try { try {
List<String> files = StrUtils.splitFileNames(stopWordFiles); stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
if (stopWords == null && files.size() > 0){
//default stopwords list has 35 or so words, but maybe don't make it that big to start
stopWords = new CharArraySet(files.size() * 10, ignoreCase);
}
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
stopWords.addAll(StopFilter.makeStopSet(wlist, ignoreCase));
}
} catch (IOException e) { } catch (IOException e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
} else { } else {
stopWords = new CharArraySet(StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase); stopWords = new CharArraySet(luceneMatchVersion, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
} }
} }
@ -78,7 +72,6 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc
} }
public StopFilter create(TokenStream input) { public StopFilter create(TokenStream input) {
assureMatchVersion();
StopFilter stopFilter = new StopFilter(luceneMatchVersion,input,stopWords,ignoreCase); StopFilter stopFilter = new StopFilter(luceneMatchVersion,input,stopWords,ignoreCase);
stopFilter.setEnablePositionIncrements(enablePositionIncrements); stopFilter.setEnablePositionIncrements(enablePositionIncrements);
return stopFilter; return stopFilter;
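The word-list loading deleted above (and the identical loops removed from SnowballPorterFilterFactory and WordDelimiterFilterFactory) is now centralized in a getWordSet helper on the factory base class. A rough sketch of what such a helper has to do, assuming it mirrors the removed code; the real implementation may differ:

import java.io.IOException;
import java.util.List;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;

public class WordSetSketch {
  // wordFiles is a comma-separated list, e.g. "stop-1.txt, stop-2.txt"
  static CharArraySet loadWordSet(ResourceLoader loader, String wordFiles,
                                  boolean ignoreCase) throws IOException {
    List<String> files = StrUtils.splitFileNames(wordFiles);
    CharArraySet words = new CharArraySet(Version.LUCENE_31, files.size() * 10, ignoreCase);
    for (String file : files) {
      words.addAll(loader.getLines(file.trim())); // one entry per line
    }
    return words;
  }
}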


@ -24,7 +24,6 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.AttributeSource;
import java.io.IOException; import java.io.IOException;
@ -50,7 +49,7 @@ public class SynonymFilter extends TokenFilter {
public SynonymFilter(TokenStream in, SynonymMap map) { public SynonymFilter(TokenStream in, SynonymMap map) {
super(in); super(in);
this.map = map; this.map = map;
// just ensuring these exist attributes exist... // just ensuring these attributes exist...
addAttribute(TermAttribute.class); addAttribute(TermAttribute.class);
addAttribute(PositionIncrementAttribute.class); addAttribute(PositionIncrementAttribute.class);
addAttribute(OffsetAttribute.class); addAttribute(OffsetAttribute.class);
@ -88,7 +87,7 @@ public class SynonymFilter extends TokenFilter {
// common case fast-path of first token not matching anything // common case fast-path of first token not matching anything
AttributeSource firstTok = nextTok(); AttributeSource firstTok = nextTok();
if (firstTok == null) return false; if (firstTok == null) return false;
TermAttribute termAtt = (TermAttribute) firstTok.addAttribute(TermAttribute.class); TermAttribute termAtt = firstTok.addAttribute(TermAttribute.class);
SynonymMap result = map.submap!=null ? map.submap.get(termAtt.termBuffer(), 0, termAtt.termLength()) : null; SynonymMap result = map.submap!=null ? map.submap.get(termAtt.termBuffer(), 0, termAtt.termLength()) : null;
if (result == null) { if (result == null) {
copy(this, firstTok); copy(this, firstTok);
@ -121,7 +120,7 @@ public class SynonymFilter extends TokenFilter {
boolean includeOrig = result.includeOrig(); boolean includeOrig = result.includeOrig();
AttributeSource origTok = includeOrig ? firstTok : null; AttributeSource origTok = includeOrig ? firstTok : null;
PositionIncrementAttribute firstPosIncAtt = (PositionIncrementAttribute) firstTok.addAttribute(PositionIncrementAttribute.class); PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
int repPos=0; // curr position in replacement token stream int repPos=0; // curr position in replacement token stream
int pos=0; // current position in merged token stream int pos=0; // current position in merged token stream
@ -129,12 +128,11 @@ public class SynonymFilter extends TokenFilter {
for (int i=0; i<result.synonyms.length; i++) { for (int i=0; i<result.synonyms.length; i++) {
Token repTok = result.synonyms[i]; Token repTok = result.synonyms[i];
AttributeSource newTok = firstTok.cloneAttributes(); AttributeSource newTok = firstTok.cloneAttributes();
TermAttribute newTermAtt = (TermAttribute) newTok.addAttribute(TermAttribute.class); TermAttribute newTermAtt = newTok.addAttribute(TermAttribute.class);
OffsetAttribute newOffsetAtt = (OffsetAttribute) newTok.addAttribute(OffsetAttribute.class); OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
TypeAttribute newTypeAtt = (TypeAttribute) newTok.addAttribute(TypeAttribute.class); PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);
PositionIncrementAttribute newPosIncAtt = (PositionIncrementAttribute) newTok.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute lastOffsetAtt = (OffsetAttribute) lastTok.addAttribute(OffsetAttribute.class); OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);
newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset()); newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
newTermAtt.setTermBuffer(repTok.termBuffer(), 0, repTok.termLength()); newTermAtt.setTermBuffer(repTok.termBuffer(), 0, repTok.termLength());
@ -143,13 +141,13 @@ public class SynonymFilter extends TokenFilter {
// if necessary, insert original tokens and adjust position increment // if necessary, insert original tokens and adjust position increment
while (origTok != null && origPos <= repPos) { while (origTok != null && origPos <= repPos) {
PositionIncrementAttribute origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class); PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPosInc.setPositionIncrement(origPos-pos); origPosInc.setPositionIncrement(origPos-pos);
generated.add(origTok); generated.add(origTok);
pos += origPosInc.getPositionIncrement(); pos += origPosInc.getPositionIncrement();
origTok = matched.isEmpty() ? null : matched.removeFirst(); origTok = matched.isEmpty() ? null : matched.removeFirst();
if (origTok != null) { if (origTok != null) {
origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class); origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPos += origPosInc.getPositionIncrement(); origPos += origPosInc.getPositionIncrement();
} }
} }
@ -161,13 +159,13 @@ public class SynonymFilter extends TokenFilter {
// finish up any leftover original tokens // finish up any leftover original tokens
while (origTok!=null) { while (origTok!=null) {
PositionIncrementAttribute origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class); PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPosInc.setPositionIncrement(origPos-pos); origPosInc.setPositionIncrement(origPos-pos);
generated.add(origTok); generated.add(origTok);
pos += origPosInc.getPositionIncrement(); pos += origPosInc.getPositionIncrement();
origTok = matched.isEmpty() ? null : matched.removeFirst(); origTok = matched.isEmpty() ? null : matched.removeFirst();
if (origTok != null) { if (origTok != null) {
origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class); origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPos += origPosInc.getPositionIncrement(); origPos += origPosInc.getPositionIncrement();
} }
} }
@ -217,7 +215,7 @@ public class SynonymFilter extends TokenFilter {
if (tok == this) if (tok == this)
tok = cloneAttributes(); tok = cloneAttributes();
// check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level? // check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level?
TermAttribute termAtt = (TermAttribute) tok.getAttribute(TermAttribute.class); TermAttribute termAtt = tok.getAttribute(TermAttribute.class);
SynonymMap subMap = map.submap.get(termAtt.termBuffer(), 0, termAtt.termLength()); SynonymMap subMap = map.submap.get(termAtt.termBuffer(), 0, termAtt.termLength());
if (subMap != null) { if (subMap != null) {
@ -243,12 +241,8 @@ public class SynonymFilter extends TokenFilter {
} }
private void copy(AttributeSource target, AttributeSource source) { private void copy(AttributeSource target, AttributeSource source) {
if (target == source) if (target != source)
return; source.copyTo(target);
for (Iterator<AttributeImpl> sourceIt = source.getAttributeImplsIterator(), targetIt=target.getAttributeImplsIterator();
sourceIt.hasNext();) {
sourceIt.next().copyTo(targetIt.next());
}
} }
@Override @Override
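The simplified copy() above leans on AttributeSource.copyTo, which copies the state of every attribute to a target with a compatible attribute layout in one call. A minimal sketch (the token text is made up):

import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;

public class CopyToSketch {
  public static void main(String[] args) {
    AttributeSource source = new AttributeSource();
    source.addAttribute(TermAttribute.class).setTermBuffer("wi-fi");
    AttributeSource target = source.cloneAttributes(); // same attribute layout
    source.copyTo(target);
    System.out.println(target.getAttribute(TermAttribute.class).term()); // wi-fi
  }
}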


@ -17,7 +17,6 @@
package org.apache.solr.analysis; package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.solr.common.ResourceLoader; import org.apache.solr.common.ResourceLoader;
@ -136,7 +135,7 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso
TokenStream ts = loadTokenizer(tokFactory, reader); TokenStream ts = loadTokenizer(tokFactory, reader);
List<String> tokList = new ArrayList<String>(); List<String> tokList = new ArrayList<String>();
try { try {
TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class); TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
while (ts.incrementToken()){ while (ts.incrementToken()){
String text = new String(termAtt.termBuffer(), 0, termAtt.termLength()); String text = new String(termAtt.termBuffer(), 0, termAtt.termLength());
if( text.length() > 0 ) if( text.length() > 0 )


@ -17,8 +17,9 @@
package org.apache.solr.analysis; package org.apache.solr.analysis;
import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Token;
import org.apache.solr.util.CharArrayMap; import org.apache.lucene.util.Version;
import java.util.*; import java.util.*;
@ -52,7 +53,9 @@ public class SynonymMap {
SynonymMap currMap = this; SynonymMap currMap = this;
for (String str : singleMatch) { for (String str : singleMatch) {
if (currMap.submap==null) { if (currMap.submap==null) {
currMap.submap = new CharArrayMap<SynonymMap>(1, ignoreCase()); // for now hardcode at 2.9, as it's what the old code did.
// would be nice to fix, but shouldn't store a version in each submap!!!
currMap.submap = new CharArrayMap<SynonymMap>(Version.LUCENE_29, 1, ignoreCase());
} }
SynonymMap map = currMap.submap.get(str); SynonymMap map = currMap.submap.get(str);
@ -68,7 +71,7 @@ public class SynonymMap {
if (currMap.synonyms != null && !mergeExisting) { if (currMap.synonyms != null && !mergeExisting) {
throw new RuntimeException("SynonymFilter: there is already a mapping for " + singleMatch); throw new RuntimeException("SynonymFilter: there is already a mapping for " + singleMatch);
} }
List superset = currMap.synonyms==null ? replacement : List<Token> superset = currMap.synonyms==null ? replacement :
mergeTokens(Arrays.asList(currMap.synonyms), replacement); mergeTokens(Arrays.asList(currMap.synonyms), replacement);
currMap.synonyms = (Token[])superset.toArray(new Token[superset.size()]); currMap.synonyms = (Token[])superset.toArray(new Token[superset.size()]);
if (includeOrig) currMap.flags |= INCLUDE_ORIG; if (includeOrig) currMap.flags |= INCLUDE_ORIG;


@ -18,15 +18,11 @@
package org.apache.solr.analysis; package org.apache.solr.analysis;
import org.apache.lucene.analysis.th.*; import org.apache.lucene.analysis.th.ThaiWordFilter;
import java.io.IOException;
import java.util.Locale;
import java.lang.Character.UnicodeBlock;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import java.text.BreakIterator;
import java.util.Map; /** Factory for {@link ThaiWordFilter} */
public class ThaiWordFilterFactory extends BaseTokenFilterFactory { public class ThaiWordFilterFactory extends BaseTokenFilterFactory {
public ThaiWordFilter create(TokenStream input) { public ThaiWordFilter create(TokenStream input) {
return new ThaiWordFilter(input); return new ThaiWordFilter(input);


@ -18,13 +18,11 @@
package org.apache.solr.analysis; package org.apache.solr.analysis;
import org.apache.lucene.analysis.payloads.*;
import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.payloads.TokenOffsetPayloadTokenFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Payload;
import java.io.IOException; /** Factory for {@link TokenOffsetPayloadTokenFilter} */
import java.util.Map;
public class TokenOffsetPayloadTokenFilterFactory extends BaseTokenFilterFactory { public class TokenOffsetPayloadTokenFilterFactory extends BaseTokenFilterFactory {
public TokenOffsetPayloadTokenFilter create(TokenStream input) { public TokenOffsetPayloadTokenFilter create(TokenStream input) {
return new TokenOffsetPayloadTokenFilter(input); return new TokenOffsetPayloadTokenFilter(input);


@ -23,7 +23,6 @@ import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import java.io.Reader; import java.io.Reader;
import java.io.IOException;
/** /**
* @version $Id$ * @version $Id$

View File

@ -19,7 +19,6 @@ package org.apache.solr.analysis;
import java.io.*; import java.io.*;
import java.util.Map; import java.util.Map;
import org.apache.solr.core.SolrConfig;
import org.apache.lucene.analysis.*; import org.apache.lucene.analysis.*;


@ -16,7 +16,6 @@
*/ */
package org.apache.solr.analysis; package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.NumericTokenStream; import org.apache.lucene.analysis.NumericTokenStream;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException;


@ -17,7 +17,6 @@
package org.apache.solr.analysis; package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@ -41,8 +40,8 @@ public final class TrimFilter extends TokenFilter {
super(in); super(in);
this.updateOffsets = updateOffsets; this.updateOffsets = updateOffsets;
this.termAtt = (TermAttribute) addAttribute(TermAttribute.class); this.termAtt = addAttribute(TermAttribute.class);
this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); this.offsetAtt = addAttribute(OffsetAttribute.class);
} }
@Override @Override

View File

@ -20,7 +20,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;
/** Factory for TurkishLowerCaseFilter */ /** Factory for {@link TurkishLowerCaseFilter} */
public class TurkishLowerCaseFilterFactory extends BaseTokenFilterFactory { public class TurkishLowerCaseFilterFactory extends BaseTokenFilterFactory {
public TokenStream create(TokenStream input) { public TokenStream create(TokenStream input) {
return new TurkishLowerCaseFilter(input); return new TurkishLowerCaseFilter(input);


@ -18,13 +18,11 @@
package org.apache.solr.analysis; package org.apache.solr.analysis;
import org.apache.lucene.analysis.payloads.*;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.index.Payload; /** Factory for {@link TypeAsPayloadTokenFilter} */
import java.io.IOException;
import java.util.Map;
public class TypeAsPayloadTokenFilterFactory extends BaseTokenFilterFactory { public class TypeAsPayloadTokenFilterFactory extends BaseTokenFilterFactory {
public TypeAsPayloadTokenFilter create(TokenStream input) { public TypeAsPayloadTokenFilter create(TokenStream input) {
return new TypeAsPayloadTokenFilter(input); return new TypeAsPayloadTokenFilter(input);


@ -17,17 +17,22 @@
package org.apache.solr.analysis; package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer; import org.apache.lucene.analysis.WhitespaceTokenizer;
import java.io.Reader; import java.io.Reader;
import java.util.Map;
/** /**
* @version $Id$ * @version $Id$
*/ */
public class WhitespaceTokenizerFactory extends BaseTokenizerFactory { public class WhitespaceTokenizerFactory extends BaseTokenizerFactory {
public WhitespaceTokenizer create(Reader input) { @Override
public void init(Map<String,String> args) {
super.init(args);
assureMatchVersion(); assureMatchVersion();
}
public WhitespaceTokenizer create(Reader input) {
return new WhitespaceTokenizer(luceneMatchVersion,input); return new WhitespaceTokenizer(luceneMatchVersion,input);
} }
} }


@ -120,10 +120,10 @@ final class WordDelimiterFilter extends TokenFilter {
*/ */
final CharArraySet protWords; final CharArraySet protWords;
private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class); private final TermAttribute termAttribute = addAttribute(TermAttribute.class);
private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class); private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class); private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
// used for iterating word delimiter breaks // used for iterating word delimiter breaks
private final WordDelimiterIterator iterator; private final WordDelimiterIterator iterator;


@ -21,12 +21,8 @@ import org.apache.lucene.analysis.CharArraySet;
import org.apache.solr.util.plugin.ResourceLoaderAware; import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.solr.common.ResourceLoader; import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import java.util.Map; import java.util.Map;
import java.io.File;
import java.util.List;
import java.io.IOException; import java.io.IOException;
@ -40,21 +36,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implement
String wordFiles = args.get(PROTECTED_TOKENS); String wordFiles = args.get(PROTECTED_TOKENS);
if (wordFiles != null) { if (wordFiles != null) {
try { try {
File protectedWordFiles = new File(wordFiles); protectedWords = getWordSet(loader, wordFiles, false);
if (protectedWordFiles.exists()) {
List<String> wlist = loader.getLines(wordFiles);
//This cast is safe in Lucene
protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
} else {
List<String> files = StrUtils.splitFileNames(wordFiles);
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
if (protectedWords == null)
protectedWords = new CharArraySet(wlist, false);
else
protectedWords.addAll(wlist);
}
}
} catch (IOException e) { } catch (IOException e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }


@ -1,411 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.util;
import java.util.*;
import java.io.Serializable;
/**
* A simple class that stores key Strings as char[]'s in a
* hash table. Note that this is not a general purpose
* class. For example, it cannot remove items from the
* map, nor does it resize its hash table to be smaller,
* etc. It is designed to be quick to retrieve items
* by char[] keys without the necessity of converting
* to a String first.
*/
public class CharArrayMap<V> extends AbstractMap<String, V>
implements Map<String, V>, Cloneable, Serializable
{
private final static int INIT_SIZE = 2;
private char[][] keys;
private Object[] values;
private int count;
private final boolean ignoreCase;
/** Create map with enough capacity to hold startSize
* terms */
public CharArrayMap(int initialCapacity, boolean ignoreCase) {
this.ignoreCase = ignoreCase;
int size = INIT_SIZE;
// load factor of .75, inverse is 1.25, or x+x/4
initialCapacity = initialCapacity + (initialCapacity >>2);
while(size <= initialCapacity)
size <<= 1;
keys = new char[size][];
values = new Object[size];
}
public boolean ignoreCase() {
return ignoreCase;
}
public V get(char[] key) {
return get(key, 0, key.length);
}
public V get(char[] key, int off, int len) {
return (V)values[getSlot(key, off, len)];
}
public V get(CharSequence key) {
return (V)values[getSlot(key)];
}
@Override
public V get(Object key) {
return (V)values[getSlot(key)];
}
@Override
public boolean containsKey(Object s) {
return keys[getSlot(s)] != null;
}
@Override
public boolean containsValue(Object value) {
if (value == null) {
// search for key with a null value
for (int i=0; i<keys.length; i++) {
if (keys[i] != null && values[i] == null) return true;
}
return false;
}
for (int i=0; i<values.length; i++) {
Object val = values[i];
if (val != null && value.equals(val)) return true;
}
return false;
}
private int getSlot(Object key) {
if (key instanceof char[]) {
char[] keyc = (char[])key;
return getSlot(keyc, 0, keyc.length);
}
return getSlot((CharSequence)key);
}
private int getSlot(char[] key, int off, int len) {
int code = getHashCode(key, len);
int pos = code & (keys.length-1);
char[] key2 = keys[pos];
if (key2 != null && !equals(key, off, len, key2)) {
final int inc = ((code>>8)+code)|1;
do {
code += inc;
pos = code & (keys.length-1);
key2 = keys[pos];
} while (key2 != null && !equals(key, off, len, key2));
}
return pos;
}
/** Returns the hash slot for this CharSequence key */
private int getSlot(CharSequence key) {
int code = getHashCode(key);
int pos = code & (keys.length-1);
char[] key2 = keys[pos];
if (key2 != null && !equals(key, key2)) {
final int inc = ((code>>8)+code)|1;
do {
code += inc;
pos = code & (keys.length-1);
key2 = keys[pos];
} while (key2 != null && !equals(key, key2));
}
return pos;
}
public V put(CharSequence key, V val) {
return put(key.toString(), val); // could be more efficient
}
@Override
public V put(String key, V val) {
return put(key.toCharArray(), val);
}
/** Add this key,val pair to the map.
* The char[] key is directly used, no copy is made.
* If ignoreCase is true for this Map, the key array will be directly modified.
* The user should never modify the key after calling this method.
*/
public V put(char[] key, Object val) {
if (ignoreCase)
for(int i=0;i< key.length;i++)
key[i] = Character.toLowerCase(key[i]);
int slot = getSlot(key, 0, key.length);
if (keys[slot] == null) count++;
Object prev = values[slot];
keys[slot] = key;
values[slot] = val;
if (count + (count>>2) >= keys.length) {
rehash();
}
return (V)prev;
}
private boolean equals(char[] text1, int off, int len, char[] text2) {
if (len != text2.length)
return false;
if (ignoreCase) {
for(int i=0;i<len;i++) {
if (Character.toLowerCase(text1[off+i]) != text2[i])
return false;
}
} else {
for(int i=0;i<len;i++) {
if (text1[off+i] != text2[i])
return false;
}
}
return true;
}
private boolean equals(CharSequence text1, char[] text2) {
int len = text1.length();
if (len != text2.length)
return false;
if (ignoreCase) {
for(int i=0;i<len;i++) {
if (Character.toLowerCase(text1.charAt(i)) != text2[i])
return false;
}
} else {
for(int i=0;i<len;i++) {
if (text1.charAt(i) != text2[i])
return false;
}
}
return true;
}
private void rehash() {
final int newSize = 2* keys.length;
char[][] oldEntries = keys;
Object[] oldValues = values;
keys = new char[newSize][];
values = new Object[newSize];
for(int i=0;i<oldEntries.length;i++) {
char[] key = oldEntries[i];
if (key != null) {
// todo: could be faster... no need to compare keys on collision
// since they are unique
int newSlot = getSlot(key,0,key.length);
keys[newSlot] = key;
values[newSlot] = oldValues[i];
}
}
}
private int getHashCode(char[] text, int len) {
int code = 0;
if (ignoreCase) {
for (int i=0; i<len; i++) {
code = code*31 + Character.toLowerCase(text[i]);
}
} else {
for (int i=0; i<len; i++) {
code = code*31 + text[i];
}
}
return code;
}
private int getHashCode(CharSequence text) {
int code;
if (ignoreCase) {
code = 0;
int len = text.length();
for (int i=0; i<len; i++) {
code = code*31 + Character.toLowerCase(text.charAt(i));
}
} else {
if (false && text instanceof String) {
code = text.hashCode();
} else {
code = 0;
int len = text.length();
for (int i=0; i<len; i++) {
code = code*31 + text.charAt(i);
}
}
}
return code;
}
@Override
public int size() {
return count;
}
@Override
public boolean isEmpty() {
return count==0;
}
@Override
public void clear() {
count = 0;
Arrays.fill(keys,null);
Arrays.fill(values,null);
}
@Override
public Set<Entry<String, V>> entrySet() {
return new EntrySet();
}
/** Returns an EntryIterator over this Map. */
public EntryIterator iterator() {
return new EntryIterator();
}
/** public iterator class so efficient methods are exposed to users */
public class EntryIterator implements Iterator<Map.Entry<String,V>> {
int pos=-1;
int lastPos;
EntryIterator() {
goNext();
}
private void goNext() {
lastPos = pos;
pos++;
while (pos < keys.length && keys[pos] == null) pos++;
}
public boolean hasNext() {
return pos < keys.length;
}
/** gets the next key... do not modify the returned char[] */
public char[] nextKey() {
goNext();
return keys[lastPos];
}
/** gets the next key as a newly created String object */
public String nextKeyString() {
return new String(nextKey());
}
/** returns the value associated with the last key returned */
public V currentValue() {
return (V)values[lastPos];
}
/** sets the value associated with the last key returned */
public V setValue(V value) {
V old = (V)values[lastPos];
values[lastPos] = value;
return old;
}
/** Returns an Entry<String,V> object created on the fly...
* use nextKey() + currentValue() for better efficiency. */
public Map.Entry<String,V> next() {
goNext();
return new MapEntry(lastPos);
}
public void remove() {
throw new UnsupportedOperationException();
}
}
private class MapEntry implements Map.Entry<String,V> {
final int pos;
MapEntry(int pos) {
this.pos = pos;
}
public char[] getCharArr() {
return keys[pos];
}
public String getKey() {
return new String(getCharArr());
}
public V getValue() {
return (V)values[pos];
}
public V setValue(V value) {
V old = (V)values[pos];
values[pos] = value;
return old;
}
public String toString() {
return getKey() + '=' + getValue();
}
}
private class EntrySet extends AbstractSet<Map.Entry<String, V>> {
public EntryIterator iterator() {
return new EntryIterator();
}
public boolean contains(Object o) {
if (!(o instanceof Map.Entry))
return false;
Map.Entry e = (Map.Entry)o;
Object key = e.getKey();
if (key==null) return false; // we don't support null keys
Object val = e.getValue();
Object v = get(key);
return v==null ? val==null : v.equals(val);
}
public boolean remove(Object o) {
throw new UnsupportedOperationException();
}
public int size() {
return count;
}
public void clear() {
CharArrayMap.this.clear();
}
}
@Override
public Object clone() {
CharArrayMap<V> map = null;
try {
map = (CharArrayMap<V>)super.clone();
map.keys = keys.clone();
map.values = values.clone();
} catch (CloneNotSupportedException e) {
// impossible
}
return map;
}
}
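The 411-line class removed above is superseded by Lucene's org.apache.lucene.analysis.CharArrayMap, which keeps the same char[]-keyed, allocation-free lookups. A minimal usage sketch:

import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.util.Version;

public class LuceneCharArrayMapSketch {
  public static void main(String[] args) {
    CharArrayMap<Integer> map = new CharArrayMap<Integer>(Version.LUCENE_31, 4, true);
    map.put("Solr", 1);
    char[] buffer = "using solr daily".toCharArray();
    // look up a slice of an existing buffer; no String is allocated:
    System.out.println(map.get(buffer, 6, 4)); // 1 (ignoreCase == true)
  }
}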


@ -21,13 +21,18 @@ import java.util.Collections;
import java.util.Map; import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;
import org.apache.solr.core.Config;
/** /**
* General token testing helper functions * General token testing helper functions
*/ */
public abstract class BaseTokenTestCase extends BaseTokenStreamTestCase public abstract class BaseTokenTestCase extends BaseTokenStreamTestCase
{ {
/** a map containing the default test version param for easy testing */
protected static final Map<String,String> DEFAULT_VERSION_PARAM = protected static final Map<String,String> DEFAULT_VERSION_PARAM =
Collections.singletonMap("luceneMatchVersion", System.getProperty("tests.luceneMatchVersion", "LUCENE_CURRENT")); Collections.singletonMap("luceneMatchVersion", System.getProperty("tests.luceneMatchVersion", "LUCENE_CURRENT"));
/** The default test version for easy testing */
public static final Version DEFAULT_VERSION = Config.parseLuceneVersionString(DEFAULT_VERSION_PARAM.get("luceneMatchVersion"));
} }


@ -39,12 +39,12 @@ public class CommonGramsFilterFactoryTest extends BaseTokenTestCase {
ResourceLoader loader = new SolrResourceLoader(null, null); ResourceLoader loader = new SolrResourceLoader(null, null);
assertTrue("loader is null and it shouldn't be", loader != null); assertTrue("loader is null and it shouldn't be", loader != null);
CommonGramsFilterFactory factory = new CommonGramsFilterFactory(); CommonGramsFilterFactory factory = new CommonGramsFilterFactory();
Map<String, String> args = new HashMap<String, String>(); Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
args.put("words", "stop-1.txt"); args.put("words", "stop-1.txt");
args.put("ignoreCase", "true"); args.put("ignoreCase", "true");
factory.init(args); factory.init(args);
factory.inform(loader); factory.inform(loader);
Set words = factory.getCommonWords(); Set<?> words = factory.getCommonWords();
assertTrue("words is null and it shouldn't be", words != null); assertTrue("words is null and it shouldn't be", words != null);
assertTrue("words Size: " + words.size() + " is not: " + 2, assertTrue("words Size: " + words.size() + " is not: " + 2,
words.size() == 2); words.size() == 2);
@ -71,13 +71,13 @@ public class CommonGramsFilterFactoryTest extends BaseTokenTestCase {
ResourceLoader loader = new SolrResourceLoader(null, null); ResourceLoader loader = new SolrResourceLoader(null, null);
assertTrue("loader is null and it shouldn't be", loader != null); assertTrue("loader is null and it shouldn't be", loader != null);
CommonGramsFilterFactory factory = new CommonGramsFilterFactory(); CommonGramsFilterFactory factory = new CommonGramsFilterFactory();
Map<String, String> args = new HashMap<String, String>(); Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
factory.init(args); factory.init(args);
factory.inform(loader); factory.inform(loader);
Set words = factory.getCommonWords(); Set<?> words = factory.getCommonWords();
assertTrue("words is null and it shouldn't be", words != null); assertTrue("words is null and it shouldn't be", words != null);
assertTrue(words.contains("the")); assertTrue(words.contains("the"));
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("testing the factory")); Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("testing the factory"));
TokenStream stream = factory.create(tokenizer); TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, assertTokenStreamContents(stream,
new String[] { "testing", "testing_the", "the", "the_factory", "factory" }); new String[] { "testing", "testing_the", "the", "the_factory", "factory" });


@ -35,10 +35,10 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
public void testReset() throws Exception { public void testReset() throws Exception {
final String input = "How the s a brown s cow d like A B thing?"; final String input = "How the s a brown s cow d like A B thing?";
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input)); WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords); CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TermAttribute term = (TermAttribute) cgf.addAttribute(TermAttribute.class); TermAttribute term = cgf.addAttribute(TermAttribute.class);
assertTrue(cgf.incrementToken()); assertTrue(cgf.incrementToken());
assertEquals("How", term.term()); assertEquals("How", term.term());
assertTrue(cgf.incrementToken()); assertTrue(cgf.incrementToken());
@ -56,11 +56,11 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
public void testQueryReset() throws Exception { public void testQueryReset() throws Exception {
final String input = "How the s a brown s cow d like A B thing?"; final String input = "How the s a brown s cow d like A B thing?";
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input)); WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords); CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf); CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);
-    TermAttribute term = (TermAttribute) wt.addAttribute(TermAttribute.class);
+    TermAttribute term = wt.addAttribute(TermAttribute.class);
     assertTrue(nsf.incrementToken());
     assertEquals("How_the", term.term());
     assertTrue(nsf.incrementToken());
@@ -88,7 +88,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
       @Override
       public TokenStream tokenStream(String field, Reader in) {
         return new CommonGramsQueryFilter(new CommonGramsFilter(
-            new WhitespaceTokenizer(in), commonWords));
+            new WhitespaceTokenizer(DEFAULT_VERSION, in), commonWords));
       }
     };
@@ -157,7 +157,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
       @Override
       public TokenStream tokenStream(String field, Reader in) {
         return new CommonGramsFilter(
-            new WhitespaceTokenizer(in), commonWords);
+            new WhitespaceTokenizer(DEFAULT_VERSION, in), commonWords);
       }
     };
@@ -243,7 +243,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
    */
   public void testCaseSensitive() throws Exception {
     final String input = "How The s a brown s cow d like A B thing?";
-    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     Set common = CommonGramsFilter.makeCommonSet(commonWords);
     TokenFilter cgf = new CommonGramsFilter(wt, common, false);
     assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s",
@@ -256,7 +256,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
    */
   public void testLastWordisStopWord() throws Exception {
     final String input = "dog the";
-    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
     TokenFilter nsf = new CommonGramsQueryFilter(cgf);
     assertTokenStreamContents(nsf, new String[] { "dog_the" });
@@ -267,7 +267,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
    */
   public void testFirstWordisStopWord() throws Exception {
     final String input = "the dog";
-    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
     TokenFilter nsf = new CommonGramsQueryFilter(cgf);
     assertTokenStreamContents(nsf, new String[] { "the_dog" });
@@ -278,7 +278,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
    */
   public void testOneWordQueryStopWord() throws Exception {
     final String input = "the";
-    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
     TokenFilter nsf = new CommonGramsQueryFilter(cgf);
     assertTokenStreamContents(nsf, new String[] { "the" });
@@ -289,7 +289,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
    */
   public void testOneWordQuery() throws Exception {
     final String input = "monster";
-    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
     TokenFilter nsf = new CommonGramsQueryFilter(cgf);
     assertTokenStreamContents(nsf, new String[] { "monster" });
@@ -300,7 +300,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
    */
   public void TestFirstAndLastStopWord() throws Exception {
     final String input = "the of";
-    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
     TokenFilter nsf = new CommonGramsQueryFilter(cgf);
     assertTokenStreamContents(nsf, new String[] { "the_of" });
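Note: the edit repeated throughout these hunks is the Version-aware tokenizer API from Lucene trunk: tokenizer constructors take an explicit Version so analysis behavior can be pinned per index, and the tests centralize that constant as DEFAULT_VERSION in BaseTokenTestCase. A minimal standalone sketch of the pattern (Version.LUCENE_31 and the sample text are illustrative assumptions, not part of this commit):

    import java.io.StringReader;

    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.util.Version;

    public class VersionedTokenizerSketch {
      public static void main(String[] args) throws Exception {
        // The Version argument pins tokenizer behavior to a release's semantics;
        // the tests pass one shared constant so every stream agrees.
        Tokenizer wt = new WhitespaceTokenizer(Version.LUCENE_31,
            new StringReader("How the tests tokenize"));
        while (wt.incrementToken()) {
          // attributes on the stream carry the term text, offsets, etc.
        }
      }
    }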
@@ -38,12 +38,12 @@ public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase {
     ResourceLoader loader = new SolrResourceLoader(null, null);
     assertTrue("loader is null and it shouldn't be", loader != null);
     CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory();
-    Map<String, String> args = new HashMap<String, String>();
+    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     args.put("words", "stop-1.txt");
     args.put("ignoreCase", "true");
     factory.init(args);
     factory.inform(loader);
-    Set words = factory.getCommonWords();
+    Set<?> words = factory.getCommonWords();
     assertTrue("words is null and it shouldn't be", words != null);
     assertTrue("words Size: " + words.size() + " is not: " + 2,
         words.size() == 2);
@@ -70,13 +70,13 @@ public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase {
     ResourceLoader loader = new SolrResourceLoader(null, null);
     assertTrue("loader is null and it shouldn't be", loader != null);
     CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory();
-    Map<String, String> args = new HashMap<String, String>();
+    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     factory.init(args);
     factory.inform(loader);
-    Set words = factory.getCommonWords();
+    Set<?> words = factory.getCommonWords();
     assertTrue("words is null and it shouldn't be", words != null);
     assertTrue(words.contains("the"));
-    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("testing the factory"));
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("testing the factory"));
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream,
         new String[] { "testing_the", "the_factory" });
@@ -29,7 +29,7 @@ public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase {
   public void testDefaults() throws Exception {
     DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
     factory.init(new HashMap<String, String>());
-    TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
+    TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
     TokenStream filteredStream = factory.create(inputStream);
     assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
@@ -43,7 +43,7 @@ public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase {
     parameters.put("maxCodeLength", "8");
     factory.init(parameters);
-    TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
+    TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
     TokenStream filteredStream = factory.create(inputStream);
     assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
@@ -56,10 +56,10 @@ public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase {
   public void testReset() throws Exception {
     DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
     factory.init(new HashMap<String, String>());
-    TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
+    TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
     TokenStream filteredStream = factory.create(inputStream);
-    TermAttribute termAtt = (TermAttribute) filteredStream.addAttribute(TermAttribute.class);
+    TermAttribute termAtt = filteredStream.addAttribute(TermAttribute.class);
     assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
     assertTrue(filteredStream.incrementToken());
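Note: the dropped (TermAttribute) casts above work because AttributeSource.addAttribute is generic in the trunk API -- roughly <A extends Attribute> A addAttribute(Class<A>) -- so the returned attribute is already typed. A short sketch combining both recurring edits (the Version constant and input string are illustrative):

    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.Version;

    public class AddAttributeSketch {
      public static void main(String[] args) throws Exception {
        TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_31,
            new StringReader("hello world"));
        // No cast needed: the Class<A> parameter fixes the return type at the call site.
        TermAttribute term = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term());
        }
      }
    }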
@@ -24,42 +24,42 @@ import org.apache.lucene.analysis.WhitespaceTokenizer;
 public class DoubleMetaphoneFilterTest extends BaseTokenTestCase {
   public void testSize4FalseInject() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
+    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
     assertTokenStreamContents(filter, new String[] { "ANTR" });
   }
   public void testSize4TrueInject() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
+    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
     assertTokenStreamContents(filter, new String[] { "international", "ANTR" });
   }
   public void testAlternateInjectFalse() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(new StringReader("Kuczewski"));
+    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Kuczewski"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
     assertTokenStreamContents(filter, new String[] { "KSSK", "KXFS" });
   }
   public void testSize8FalseInject() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
+    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
     assertTokenStreamContents(filter, new String[] { "ANTRNXNL" });
   }
   public void testNonConvertableStringsWithInject() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%&"));
+    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
     assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
   }
   public void testNonConvertableStringsWithoutInject() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%&"));
+    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
     assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
     // should have something after the stream
-    stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%& hello"));
+    stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%& hello"));
     filter = new DoubleMetaphoneFilter(stream, 8, false);
     assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" });
   }
@@ -46,11 +46,11 @@ public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {
     }
     EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
-    Map<String, String> args = new HashMap<String, String>();
+    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     factory.init(args);
     factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
-    Tokenizer tokenizer = new WhitespaceTokenizer(
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION,
         new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, gold);
@@ -71,13 +71,13 @@ public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {
     }
     EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
-    Map<String, String> args = new HashMap<String, String>();
+    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     args.put(EnglishPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt");
     factory.init(args);
     List<String> lines = new ArrayList<String>();
     Collections.addAll(lines, "banks", "fledgling");
     factory.inform(new LinesMockSolrResourceLoader(lines));
-    Tokenizer tokenizer = new WhitespaceTokenizer(
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION,
         new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, gold);
@@ -33,7 +33,7 @@ public class LengthFilterTest extends BaseTokenTestCase {
     args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10));
     factory.init(args);
     String test = "foo foobar super-duper-trooper";
-    TokenStream stream = factory.create(new WhitespaceTokenizer(new StringReader(test)));
+    TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(test)));
     assertTokenStreamContents(stream, new String[] { "foobar" });
   }
 }
@@ -48,12 +48,12 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
     }
     SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
-    Map<String, String> args = new HashMap<String, String>();
+    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     args.put("language", "English");
     factory.init(args);
     factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
-    Tokenizer tokenizer = new WhitespaceTokenizer(
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION,
         new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, gold);
@@ -78,13 +78,13 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
     }
     EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
-    Map<String, String> args = new HashMap<String, String>();
+    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     args.put(SnowballPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt");
     factory.init(args);
     List<String> lines = new ArrayList<String>();
     Collections.addAll(lines, "banks", "fledgling");
     factory.inform(new LinesMockSolrResourceLoader(lines));
-    Tokenizer tokenizer = new WhitespaceTokenizer(
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION,
         new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, gold);
@@ -116,13 +116,13 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
   public void testProtected() throws Exception {
     SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
     ResourceLoader loader = new SolrResourceLoader(null, null);
-    Map<String,String> args = new HashMap<String,String>();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
     args.put("protected", "protwords.txt");
     args.put("language", "English");
     factory.init(args);
     factory.inform(loader);
     Reader reader = new StringReader("ridding of some stemming");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "ridding", "of", "some", "stem" });
   }
@@ -33,7 +33,7 @@ public class TestBrazilianStemFilterFactory extends BaseTokenTestCase {
    */
   public void testStemming() throws Exception {
     Reader reader = new StringReader("Brasília");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     BrazilianStemFilterFactory factory = new BrazilianStemFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "brasil" });
@@ -59,7 +59,7 @@ public class TestBufferedTokenStream extends BaseTokenTestCase {
     final String input = "How now A B brown A cow B like A B thing?";
     final String expected = "How now Q B brown A cow B like Q B thing?";
     TokenStream ts = new AB_Q_Stream
-      (new WhitespaceTokenizer(new StringReader(input)));
+      (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
     assertTokenStreamContents(ts, expected.split("\\s"));
   }
@@ -67,15 +67,15 @@ public class TestBufferedTokenStream extends BaseTokenTestCase {
     final String input = "How now A B brown A cow B like A B thing?";
     final String expected = "How now A A B brown A cow B like A A B thing?";
     TokenStream ts = new AB_AAB_Stream
-      (new WhitespaceTokenizer(new StringReader(input)));
+      (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
     assertTokenStreamContents(ts, expected.split("\\s"));
   }
   public void testReset() throws Exception {
     final String input = "How now A B brown A cow B like A B thing?";
-    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(input));
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     TokenStream ts = new AB_AAB_Stream(tokenizer);
-    TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
+    TermAttribute term = ts.addAttribute(TermAttribute.class);
     assertTrue(ts.incrementToken());
     assertEquals("How", term.term());
     assertTrue(ts.incrementToken());
@@ -33,7 +33,7 @@ public class TestBulgarianStemFilterFactory extends BaseTokenTestCase {
    */
   public void testStemming() throws Exception {
     Reader reader = new StringReader("компютри");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     BulgarianStemFilterFactory factory = new BulgarianStemFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "компютр" });
@@ -34,7 +34,7 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
   public void testCapitalization() throws Exception
   {
-    Map<String,String> args = new HashMap<String, String>();
+    Map<String,String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     args.put( CapitalizationFilterFactory.KEEP, "and the it BIG" );
     args.put( CapitalizationFilterFactory.ONLY_FIRST_WORD, "true" );
@@ -74,18 +74,18 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
     // now each token
     factory.onlyFirstWord = false;
-    tokenizer = new WhitespaceTokenizer(new StringReader("Hello thEre my Name is Ryan"));
+    tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"));
     stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" });
     // now only the long words
     factory.minWordLength = 3;
-    tokenizer = new WhitespaceTokenizer(new StringReader("Hello thEre my Name is Ryan" ));
+    tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan" ));
     stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "Hello", "There", "my", "Name", "is", "Ryan" });
     // without prefix
-    tokenizer = new WhitespaceTokenizer(new StringReader("McKinley" ));
+    tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" ));
     stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "Mckinley" });
@@ -93,14 +93,14 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
     factory = new CapitalizationFilterFactory();
     args.put( "okPrefix", "McK" ); // all words
     factory.init( args );
-    tokenizer = new WhitespaceTokenizer(new StringReader("McKinley" ));
+    tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" ));
     stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "McKinley" });
     // now try some stuff with numbers
     factory.forceFirstLetter = false;
     factory.onlyFirstWord = false;
-    tokenizer = new WhitespaceTokenizer(new StringReader("1st 2nd third" ));
+    tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("1st 2nd third" ));
     stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "1st", "2nd", "Third" });
@@ -111,7 +111,7 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
   }
   public void testKeepIgnoreCase() throws Exception {
-    Map<String,String> args = new HashMap<String, String>();
+    Map<String,String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     args.put( CapitalizationFilterFactory.KEEP, "kitten" );
     args.put( CapitalizationFilterFactory.KEEP_IGNORE_CASE, "true" );
     args.put( CapitalizationFilterFactory.ONLY_FIRST_WORD, "true" );
@@ -141,12 +141,12 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
    * This is very weird when combined with ONLY_FIRST_WORD!!!
    */
   public void testMinWordLength() throws Exception {
-    Map<String,String> args = new HashMap<String,String>();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
     args.put(CapitalizationFilterFactory.ONLY_FIRST_WORD, "true");
     args.put(CapitalizationFilterFactory.MIN_WORD_LENGTH, "5");
     CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
     factory.init(args);
-    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(
         "helo testing"));
     TokenStream ts = factory.create(tokenizer);
     assertTokenStreamContents(ts, new String[] {"helo", "Testing"});
@@ -157,11 +157,11 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
    * in each token (it should do nothing)
    */
   public void testMaxWordCount() throws Exception {
-    Map<String,String> args = new HashMap<String,String>();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
     args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2");
     CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
     factory.init(args);
-    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(
         "one two three four"));
     TokenStream ts = factory.create(tokenizer);
     assertTokenStreamContents(ts, new String[] {"One", "Two", "Three", "Four"});
@@ -171,7 +171,7 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
    * Test CapitalizationFilterFactory's maxWordCount option when exceeded
    */
   public void testMaxWordCount2() throws Exception {
-    Map<String,String> args = new HashMap<String,String>();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
     args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2");
     CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
     factory.init(args);
@@ -187,11 +187,11 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
    * This is weird, it is not really a max, but inclusive (look at 'is')
    */
   public void testMaxTokenLength() throws Exception {
-    Map<String,String> args = new HashMap<String,String>();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
     args.put(CapitalizationFilterFactory.MAX_TOKEN_LENGTH, "2");
     CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
     factory.init(args);
-    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(
         "this is a test"));
     TokenStream ts = factory.create(tokenizer);
     assertTokenStreamContents(ts, new String[] {"this", "is", "A", "test"});
@@ -201,12 +201,12 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
    * Test CapitalizationFilterFactory's forceFirstLetter option
    */
   public void testForceFirstLetter() throws Exception {
-    Map<String,String> args = new HashMap<String,String>();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
     args.put(CapitalizationFilterFactory.KEEP, "kitten");
     args.put(CapitalizationFilterFactory.FORCE_FIRST_LETTER, "true");
     CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
     factory.init(args);
-    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("kitten"));
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("kitten"));
     TokenStream ts = factory.create(tokenizer);
     assertTokenStreamContents(ts, new String[] {"Kitten"});
   }
@@ -33,7 +33,7 @@ public class TestChineseFilterFactory extends BaseTokenTestCase {
    */
   public void testFiltering() throws Exception {
     Reader reader = new StringReader("this 1234 Is such a silly filter");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     ChineseFilterFactory factory = new ChineseFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "Is", "silly", "filter" });
@@ -177,9 +177,9 @@ public class TestCollationKeyFilterFactory extends BaseTokenTestCase {
   private void assertCollatesToSame(TokenStream stream1, TokenStream stream2)
       throws IOException {
-    TermAttribute term1 = (TermAttribute) stream1
+    TermAttribute term1 = stream1
         .addAttribute(TermAttribute.class);
-    TermAttribute term2 = (TermAttribute) stream2
+    TermAttribute term2 = stream2
         .addAttribute(TermAttribute.class);
     assertTrue(stream1.incrementToken());
     assertTrue(stream2.incrementToken());
@@ -33,7 +33,7 @@ public class TestCzechStemFilterFactory extends BaseTokenTestCase {
    */
   public void testStemming() throws Exception {
     Reader reader = new StringReader("angličtí");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     CzechStemFilterFactory factory = new CzechStemFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "anglick" });
@@ -21,8 +21,6 @@ import java.io.StringReader;
 import java.util.HashMap;
 import java.util.Map;
-import junit.framework.TestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
@@ -32,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.core.SolrResourceLoader;
-public class TestDelimitedPayloadTokenFilterFactory extends TestCase {
+public class TestDelimitedPayloadTokenFilterFactory extends BaseTokenTestCase {
   public void testEncoder() throws Exception {
     Map<String,String> args = new HashMap<String, String>();
@@ -42,10 +40,10 @@ public class TestDelimitedPayloadTokenFilterFactory extends TestCase {
     ResourceLoader loader = new SolrResourceLoader(null, null);
     factory.inform(loader);
-    TokenStream input = new WhitespaceTokenizer(new StringReader("the|0.1 quick|0.1 red|0.1"));
+    TokenStream input = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("the|0.1 quick|0.1 red|0.1"));
     DelimitedPayloadTokenFilter tf = factory.create(input);
     while (tf.incrementToken()){
-      PayloadAttribute payAttr = (PayloadAttribute) tf.getAttribute(PayloadAttribute.class);
+      PayloadAttribute payAttr = tf.getAttribute(PayloadAttribute.class);
       assertTrue("payAttr is null and it shouldn't be", payAttr != null);
       byte[] payData = payAttr.getPayload().getData();
       assertTrue("payData is null and it shouldn't be", payData != null);
@@ -64,10 +62,10 @@ public class TestDelimitedPayloadTokenFilterFactory extends TestCase {
     ResourceLoader loader = new SolrResourceLoader(null, null);
     factory.inform(loader);
-    TokenStream input = new WhitespaceTokenizer(new StringReader("the*0.1 quick*0.1 red*0.1"));
+    TokenStream input = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("the*0.1 quick*0.1 red*0.1"));
     DelimitedPayloadTokenFilter tf = factory.create(input);
     while (tf.incrementToken()){
-      PayloadAttribute payAttr = (PayloadAttribute) tf.getAttribute(PayloadAttribute.class);
+      PayloadAttribute payAttr = tf.getAttribute(PayloadAttribute.class);
       assertTrue("payAttr is null and it shouldn't be", payAttr != null);
       byte[] payData = payAttr.getPayload().getData();
       assertTrue("payData is null and it shouldn't be", payData != null);
@@ -37,10 +37,10 @@ public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenTestCase {
    */
   public void testDecompounding() throws Exception {
     Reader reader = new StringReader("I like to play softball");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     DictionaryCompoundWordTokenFilterFactory factory = new DictionaryCompoundWordTokenFilterFactory();
     ResourceLoader loader = new SolrResourceLoader(null, null);
-    Map<String,String> args = new HashMap<String,String>();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
     args.put("dictionary", "compoundDictionary.txt");
     factory.init(args);
     factory.inform(loader);
@@ -33,7 +33,7 @@ public class TestDutchStemFilterFactory extends BaseTokenTestCase {
    */
   public void testStemming() throws Exception {
     Reader reader = new StringReader("lichamelijkheden");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     DutchStemFilterFactory factory = new DutchStemFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "licham" });
@@ -37,7 +37,7 @@ public class TestElisionFilterFactory extends BaseTokenTestCase {
    */
   public void testElision() throws Exception {
     Reader reader = new StringReader("l'avion");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     ElisionFilterFactory factory = new ElisionFilterFactory();
     factory.init(DEFAULT_VERSION_PARAM);
     ResourceLoader loader = new SolrResourceLoader(null, null);
@@ -33,7 +33,7 @@ public class TestFrenchStemFilterFactory extends BaseTokenTestCase {
    */
   public void testStemming() throws Exception {
     Reader reader = new StringReader("habitable");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     FrenchStemFilterFactory factory = new FrenchStemFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "habit" });
@@ -33,7 +33,7 @@ public class TestGermanStemFilterFactory extends BaseTokenTestCase {
    */
   public void testStemming() throws Exception {
     Reader reader = new StringReader("Tischen");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     GermanStemFilterFactory factory = new GermanStemFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "tisch" });
@@ -33,7 +33,7 @@ public class TestGreekLowerCaseFilterFactory extends BaseTokenTestCase {
    */
   public void testStemming() throws Exception {
     Reader reader = new StringReader("Μάϊος ΜΆΪΟΣ");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     GreekLowerCaseFilterFactory factory = new GreekLowerCaseFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "μαιοσ", "μαιοσ" });
@@ -29,7 +29,7 @@ public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
   public void testHyphenatedWords() throws Exception {
     String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal";
     // first test
-    TokenStream ts = new WhitespaceTokenizer(new StringReader(input));
+    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory();
     ts = factory.create(ts);
     assertTokenStreamContents(ts,
@@ -42,7 +42,7 @@ public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
   public void testHyphenAtEnd() throws Exception {
     String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecology-";
     // first test
-    TokenStream ts = new WhitespaceTokenizer(new StringReader(input));
+    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory();
     ts = factory.create(ts);
     assertTokenStreamContents(ts,
@@ -23,25 +23,22 @@ import java.util.Set;
 import java.util.Map;
 import java.util.HashMap;
-import junit.framework.TestCase;
 /**
  *
  *
  **/
-public class TestKeepFilterFactory extends TestCase{
+public class TestKeepFilterFactory extends BaseTokenTestCase{
   public void testInform() throws Exception {
     ResourceLoader loader = new SolrResourceLoader(null, null);
     assertTrue("loader is null and it shouldn't be", loader != null);
     KeepWordFilterFactory factory = new KeepWordFilterFactory();
-    Map<String, String> args = new HashMap<String, String>();
+    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     args.put("words", "keep-1.txt");
     args.put("ignoreCase", "true");
     factory.init(args);
     factory.inform(loader);
-    Set words = factory.getWords();
+    Set<?> words = factory.getWords();
     assertTrue("words is null and it shouldn't be", words != null);
     assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
@@ -41,7 +41,7 @@ public class TestKeepWordFilter extends BaseTokenTestCase {
     words.add( "bbb" );
     String input = "aaa BBB ccc ddd EEE";
-    Map<String,String> args = new HashMap<String, String>();
+    Map<String,String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     ResourceLoader loader = new SolrResourceLoader(null, null);
     // Test Stopwords
@@ -51,29 +51,29 @@ public class TestKeepWordFilter extends BaseTokenTestCase {
     factory.inform( loader );
     factory.setWords( words );
     assertTrue(factory.isIgnoreCase());
-    TokenStream stream = factory.create(new WhitespaceTokenizer(new StringReader(input)));
+    TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
     assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
     // Test Stopwords (ignoreCase via the setter instead)
     factory = new KeepWordFilterFactory();
-    args = new HashMap<String, String>();
+    args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     factory.init( args );
     factory.inform( loader );
     factory.setIgnoreCase(true);
     factory.setWords( words );
     assertTrue(factory.isIgnoreCase());
-    stream = factory.create(new WhitespaceTokenizer(new StringReader(input)));
+    stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
     assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
     // Now force case
     factory = new KeepWordFilterFactory();
-    args = new HashMap<String, String>();
+    args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     args.put( "ignoreCase", "false" );
     factory.init( args );
     factory.inform( loader );
     factory.setWords( words );
     assertFalse(factory.isIgnoreCase());
-    stream = factory.create(new WhitespaceTokenizer(new StringReader(input)));
+    stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
     assertTokenStreamContents(stream, new String[] { "aaa" });
   }
 }
@@ -0,0 +1,65 @@
+package org.apache.solr.analysis;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.lucene.analysis.PorterStemFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.core.SolrResourceLoader;
+/**
+ * Simple tests to ensure the keyword marker filter factory is working.
+ */
+public class TestKeywordMarkerFilterFactory extends BaseTokenTestCase {
+  public void testKeywords() throws IOException {
+    Reader reader = new StringReader("dogs cats");
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
+    KeywordMarkerFilterFactory factory = new KeywordMarkerFilterFactory();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
+    ResourceLoader loader = new SolrResourceLoader(null, null);
+    args.put("protected", "protwords.txt");
+    factory.init(args);
+    factory.inform(loader);
+    TokenStream ts = new PorterStemFilter(factory.create(tokenizer));
+    assertTokenStreamContents(ts, new String[] { "dog", "cats" });
+  }
+  public void testKeywordsCaseInsensitive() throws IOException {
+    Reader reader = new StringReader("dogs cats Cats");
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
+    KeywordMarkerFilterFactory factory = new KeywordMarkerFilterFactory();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
+    ResourceLoader loader = new SolrResourceLoader(null, null);
+    args.put("protected", "protwords.txt");
+    args.put("ignoreCase", "true");
+    factory.init(args);
+    factory.inform(loader);
+    TokenStream ts = new PorterStemFilter(factory.create(tokenizer));
+    assertTokenStreamContents(ts, new String[] { "dog", "cats", "Cats" });
+  }
+}
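Note: the new test above exercises KeywordMarkerFilterFactory, whose underlying filter marks listed tokens with KeywordAttribute so that keyword-aware stemmers skip them. A rough sketch of that mechanism without the factory plumbing (class location and constructor follow the Lucene 3.1-era API and are assumptions here):

    import java.io.StringReader;
    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.Set;

    import org.apache.lucene.analysis.KeywordMarkerFilter;
    import org.apache.lucene.analysis.PorterStemFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.Version;

    public class KeywordMarkerSketch {
      public static void main(String[] args) throws Exception {
        Set<String> protectedWords = new HashSet<String>(Arrays.asList("cats"));
        TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_31,
            new StringReader("dogs cats"));
        // Tokens found in the set get KeywordAttribute#isKeyword() == true ...
        ts = new KeywordMarkerFilter(ts, protectedWords);
        // ... and PorterStemFilter leaves keyword-marked tokens unstemmed.
        ts = new PorterStemFilter(ts);
        TermAttribute term = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term()); // expected: "dog", then "cats"
        }
      }
    }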
@@ -20,7 +20,7 @@ public class TestMultiWordSynonyms extends BaseTokenTestCase {
     SynonymMap synMap = new SynonymMap(true);
     SynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null);
-    SynonymFilter ts = new SynonymFilter(new WhitespaceTokenizer(new StringReader("a e")), synMap);
+    SynonymFilter ts = new SynonymFilter(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("a e")), synMap);
     // This fails because ["e","e"] is the value of the token stream
     assertTokenStreamContents(ts, new String[] { "a", "e" });
   }
Some files were not shown because too many files have changed in this diff.