SOLR-1857: cleanup and sync analysis with Lucene trunk

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@929782 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2010-04-01 02:15:27 +00:00
parent a528a707c1
commit 3860c16a66
124 changed files with 771 additions and 1264 deletions

View File

@ -126,6 +126,14 @@ New Features
* SOLR-1769: Solr 1.4 Replication - Repeater throwing NullPointerException (Jörgen Rydenius via noble)
* SOLR-1857: Synced Solr analysis with Lucene 3.1. Added KeywordMarkerFilterFactory
and StemmerOverrideFilterFactory, which can be used to tune stemming algorithms.
Added factories for Bulgarian, Czech, Hindi, and Turkish analysis. Improved the
performance of SnowballPorterFilterFactory. (rmuir)
* SOLR-1657: Converted remaining TokenStreams to the Attributes-based API. All Solr
TokenFilters now support custom Attributes, and some have improved performance:
especially WordDelimiterFilter and CommonGramsFilter. (rmuir, cmale, uschindler)
Optimizations
----------------------
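Editorial note, not part of the commit: the SOLR-1857 entry above describes tuning stemming by protecting words from the stemmer. A minimal Java sketch of the chain this commit builds inside SnowballPorterFilterFactory.create() — the sample words, tokenizer choice, and Version constant are illustrative assumptions, while KeywordMarkerTokenFilter, SnowballFilter, and CharArraySet(Version, int, boolean) are taken from the hunks below:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class ProtectedStemmingSketch {
  public static void main(String[] args) throws IOException {
    // terms marked as keywords are passed through the stemmer unchanged
    CharArraySet protectedWords = new CharArraySet(Version.LUCENE_31, 4, true);
    protectedWords.add("marketing");

    TokenStream ts = new LetterTokenizer(Version.LUCENE_31, new StringReader("markets marketing"));
    ts = new KeywordMarkerTokenFilter(ts, protectedWords);
    ts = new SnowballFilter(ts, new org.tartarus.snowball.ext.EnglishStemmer());

    TermAttribute term = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term()); // expected: "market", then "marketing" left unstemmed
    }
    ts.end();
    ts.close();
  }
}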

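For the SOLR-1657 entry, the recurring change in the hunks below is that per-stream attributes are now obtained through the generic addAttribute(Class) with no explicit cast. A hypothetical filter written against the new API — the truncation behaviour itself is invented purely for illustration, not something in this commit:

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public final class TruncateTokenFilterSketch extends TokenFilter {
  // no (TermAttribute) cast needed any more; addAttribute(Class) is generic
  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
  private final int maxLength;

  public TruncateTokenFilterSketch(TokenStream input, int maxLength) {
    super(input);
    this.maxLength = maxLength;
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) return false;
    if (termAtt.termLength() > maxLength) {
      termAtt.setTermLength(maxLength); // shorten the term in place; other attributes are untouched
    }
    return true;
  }
}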
View File

@ -18,9 +18,10 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.*;
import org.apache.lucene.util.ArrayUtil;
import java.util.Map;
import org.apache.lucene.analysis.ASCIIFoldingFilter;
import org.apache.lucene.analysis.TokenStream;
/** Factory for {@link ASCIIFoldingFilter} */
public class ASCIIFoldingFilterFactory extends BaseTokenFilterFactory {
public ASCIIFoldingFilter create(TokenStream input) {
return new ASCIIFoldingFilter(input);

View File

@ -16,15 +16,13 @@ package org.apache.solr.analysis;
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
import java.io.Reader;
/**
*
*
* Factory for {@link ArabicLetterTokenizer}
**/
public class ArabicLetterTokenizerFactory extends BaseTokenizerFactory{

View File

@ -21,8 +21,7 @@ import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
/**
*
*
* Factory for {@link ArabicNormalizationFilter}
**/
public class ArabicNormalizationFilterFactory extends BaseTokenFilterFactory{

View File

@ -21,8 +21,7 @@ import org.apache.lucene.analysis.ar.ArabicStemFilter;
/**
*
*
* Factory for {@link ArabicStemFilter}
**/
public class ArabicStemFilterFactory extends BaseTokenFilterFactory{

View File

@ -17,13 +17,17 @@
package org.apache.solr.analysis;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.core.Config;
import org.apache.solr.common.SolrException;
import org.apache.solr.schema.IndexSchema;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.util.Version;
@ -94,4 +98,22 @@ abstract class BaseTokenStreamFactory {
return Boolean.parseBoolean(s);
}
protected CharArraySet getWordSet(ResourceLoader loader,
String wordFiles, boolean ignoreCase) throws IOException {
assureMatchVersion();
List<String> files = StrUtils.splitFileNames(wordFiles);
CharArraySet words = null;
if (files.size() > 0) {
// default stopwords list has 35 or so words, but maybe don't make it that
// big to start
words = new CharArraySet(luceneMatchVersion,
files.size() * 10, ignoreCase);
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
words.addAll(StopFilter.makeStopSet(luceneMatchVersion, wlist,
ignoreCase));
}
}
return words;
}
}
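Editorial sketch, not part of the commit: the new protected getWordSet() helper above centralizes the word-file loading that each factory previously did by hand, and the converted factories later in this diff (CommonGramsFilterFactory, KeepWordFilterFactory, the new KeywordMarkerFilterFactory, and others) all reduce to the same shape. A hypothetical factory using it — the "protected" parameter name mirrors the existing factories, everything else is illustrative:

package org.apache.solr.analysis;

import java.io.IOException;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.util.plugin.ResourceLoaderAware;

/** Hypothetical example factory; not part of this commit. */
public class ExampleProtectedWordsFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
  private CharArraySet protectedWords;

  public void inform(ResourceLoader loader) {
    String wordFiles = args.get("protected");
    boolean ignoreCase = getBoolean("ignoreCase", false);
    if (wordFiles != null) {
      try {
        // one call replaces the per-factory file-splitting and loading loops removed throughout this commit
        protectedWords = getWordSet(loader, wordFiles, ignoreCase);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  }

  public TokenStream create(TokenStream input) {
    return protectedWords == null ? input : new KeywordMarkerTokenFilter(input, protectedWords);
  }
}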

View File

@ -18,15 +18,10 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.br.*;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
import java.util.Map;
import org.apache.lucene.analysis.br.BrazilianStemFilter;
/** Factory for {@link BrazilianStemFilter} */
public class BrazilianStemFilterFactory extends BaseTokenFilterFactory {
public BrazilianStemFilter create(TokenStream in) {
return new BrazilianStemFilter(in);

View File

@ -73,12 +73,12 @@ public abstract class BufferedTokenStream extends TokenFilter {
private final LinkedList<Token> inQueue = new LinkedList<Token>();
private final LinkedList<Token> outQueue = new LinkedList<Token>();
private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
private final FlagsAttribute flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
private final PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
private final PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
private final TermAttribute termAtt = addAttribute(TermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
public BufferedTokenStream(TokenStream input) {
super(input);

View File

@ -20,7 +20,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.bg.BulgarianStemFilter;
/** Factory for BulgarianStemFilter */
/** Factory for {@link BulgarianStemFilter} */
public class BulgarianStemFilterFactory extends BaseTokenFilterFactory {
public TokenStream create(TokenStream input) {
return new BulgarianStemFilter(input);

View File

@ -18,11 +18,11 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.cjk.*;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKTokenizer;
import java.io.Reader;
import java.util.Map;
/** Factory for {@link CJKTokenizer} */
public class CJKTokenizerFactory extends BaseTokenizerFactory {
public CJKTokenizer create(Reader in) {
return new CJKTokenizer(in);

View File

@ -75,6 +75,7 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
@Override
public void init(Map<String, String> args) {
super.init(args);
assureMatchVersion();
String k = args.get(KEEP);
if (k != null) {
@ -84,7 +85,7 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
if ("true".equalsIgnoreCase(ignoreStr)) {
ignoreCase = true;
}
keep = new CharArraySet(10, ignoreCase);
keep = new CharArraySet(luceneMatchVersion, 10, ignoreCase);
while (st.hasMoreTokens()) {
k = st.nextToken().trim();
keep.add(k.toCharArray());
@ -194,7 +195,7 @@ class CapitalizationFilter extends TokenFilter {
public CapitalizationFilter(TokenStream in, final CapitalizationFilterFactory factory) {
super(in);
this.factory = factory;
this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
this.termAtt = addAttribute(TermAttribute.class);
}
@Override

View File

@ -18,10 +18,14 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.cn.*;
import java.util.Hashtable;
import org.apache.lucene.analysis.*;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.ChineseFilter;
/**
* Factory for {@link ChineseFilter}
* @deprecated Use {@link StopFilterFactory} instead.
*/
@Deprecated
public class ChineseFilterFactory extends BaseTokenFilterFactory {
public ChineseFilter create(TokenStream in) {
return new ChineseFilter(in);

View File

@ -18,10 +18,15 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.cn.*;
import java.io.Reader;
import org.apache.lucene.analysis.*;
import java.util.Map;
import org.apache.lucene.analysis.cn.ChineseTokenizer;
/**
* Factory for {@link ChineseTokenizer}
* @deprecated Use {@link StandardTokenizerFactory} instead.
*/
@Deprecated
public class ChineseTokenizerFactory extends BaseTokenizerFactory {
public ChineseTokenizer create(Reader in) {
return new ChineseTokenizer(in);

View File

@ -20,6 +20,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;
/*
* TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors
@ -51,15 +52,25 @@ public final class CommonGramsFilter extends TokenFilter {
private final StringBuilder buffer = new StringBuilder();
private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class);
private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
private final TermAttribute termAttribute = addAttribute(TermAttribute.class);
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
private int lastStartOffset;
private boolean lastWasCommon;
private State savedState;
/** @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set)} instead */
public CommonGramsFilter(TokenStream input, Set<?> commonWords) {
this(Version.LUCENE_29, input, commonWords);
}
/** @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set, boolean)} instead */
public CommonGramsFilter(TokenStream input, Set<?> commonWords, boolean ignoreCase) {
this(Version.LUCENE_29, input, commonWords, ignoreCase);
}
/**
* Construct a token stream filtering the given input using a Set of common
* words to create bigrams. Outputs both unigrams with position increment and
@ -69,8 +80,8 @@ public final class CommonGramsFilter extends TokenFilter {
* @param input TokenStream input in filter chain
* @param commonWords The set of common words.
*/
public CommonGramsFilter(TokenStream input, Set commonWords) {
this(input, commonWords, false);
public CommonGramsFilter(Version matchVersion, TokenStream input, Set<?> commonWords) {
this(matchVersion, input, commonWords, false);
}
/**
@ -90,12 +101,12 @@ public final class CommonGramsFilter extends TokenFilter {
* @param commonWords The set of common words.
* @param ignoreCase -Ignore case when constructing bigrams for common words.
*/
public CommonGramsFilter(TokenStream input, Set commonWords, boolean ignoreCase) {
public CommonGramsFilter(Version matchVersion, TokenStream input, Set<?> commonWords, boolean ignoreCase) {
super(input);
if (commonWords instanceof CharArraySet) {
this.commonWords = (CharArraySet) commonWords;
} else {
this.commonWords = new CharArraySet(commonWords.size(), ignoreCase);
this.commonWords = new CharArraySet(matchVersion, commonWords.size(), ignoreCase);
this.commonWords.addAll(commonWords);
}
}
@ -106,7 +117,9 @@ public final class CommonGramsFilter extends TokenFilter {
*
* @param input Tokenstream in filter chain
* @param commonWords words to be used in constructing bigrams
* @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set)} instead.
*/
@Deprecated
public CommonGramsFilter(TokenStream input, String[] commonWords) {
this(input, commonWords, false);
}
@ -118,7 +131,9 @@ public final class CommonGramsFilter extends TokenFilter {
* @param input Tokenstream in filter chain
* @param commonWords words to be used in constructing bigrams
* @param ignoreCase -Ignore case when constructing bigrams for common words.
* @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set, boolean)} instead.
*/
@Deprecated
public CommonGramsFilter(TokenStream input, String[] commonWords, boolean ignoreCase) {
super(input);
this.commonWords = makeCommonSet(commonWords, ignoreCase);
@ -132,7 +147,9 @@ public final class CommonGramsFilter extends TokenFilter {
* @param commonWords Array of common words which will be converted into the CharArraySet
* @return CharArraySet of the given words, appropriate for passing into the CommonGramFilter constructor
* @see #makeCommonSet(java.lang.String[], boolean) passing false to ignoreCase
* @deprecated create a CharArraySet with CharArraySet instead
*/
@Deprecated
public static CharArraySet makeCommonSet(String[] commonWords) {
return makeCommonSet(commonWords, false);
}
@ -145,7 +162,9 @@ public final class CommonGramsFilter extends TokenFilter {
* @param commonWords Array of common words which will be converted into the CharArraySet
* @param ignoreCase If true, all words are lower cased first.
* @return a Set containing the words
* @deprecated create a CharArraySet with CharArraySet instead
*/
@Deprecated
public static CharArraySet makeCommonSet(String[] commonWords, boolean ignoreCase) {
CharArraySet commonSet = new CharArraySet(commonWords.length, ignoreCase);
commonSet.addAll(Arrays.asList(commonWords));

View File

@ -17,14 +17,12 @@
package org.apache.solr.analysis;
import java.io.IOException;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
/**
@ -43,16 +41,7 @@ public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements
if (commonWordFiles != null) {
try {
List<String> files = StrUtils.splitFileNames(commonWordFiles);
if (commonWords == null && files.size() > 0){
//default stopwords list has 35 or so words, but maybe don't make it that big to start
commonWords = new CharArraySet(files.size() * 10, ignoreCase);
}
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
//TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call
commonWords.addAll(CommonGramsFilter.makeCommonSet((String[])wlist.toArray(new String[0]), ignoreCase));
}
commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
} catch (IOException e) {
throw new RuntimeException(e);
}
@ -69,12 +58,12 @@ public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements
return ignoreCase;
}
public Set getCommonWords() {
public Set<?> getCommonWords() {
return commonWords;
}
public CommonGramsFilter create(TokenStream input) {
CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords, ignoreCase);
CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords, ignoreCase);
return commonGrams;
}
}

View File

@ -47,8 +47,8 @@ import static org.apache.solr.analysis.CommonGramsFilter.GRAM_TYPE;
*/
public final class CommonGramsQueryFilter extends TokenFilter {
private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class);
private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
private State previous;
private String previousType;

View File

@ -17,14 +17,13 @@
package org.apache.solr.analysis;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
/**
@ -36,25 +35,19 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
implements ResourceLoaderAware {
@Override
public void init(Map<String,String> args) {
super.init(args);
assureMatchVersion();
}
public void inform(ResourceLoader loader) {
String commonWordFiles = args.get("words");
ignoreCase = getBoolean("ignoreCase", false);
if (commonWordFiles != null) {
try {
List<String> files = StrUtils.splitFileNames(commonWordFiles);
if (commonWords == null && files.size() > 0) {
// default stopwords list has 35 or so words, but maybe don't make it
// that big to start
commonWords = new CharArraySet(files.size() * 10, ignoreCase);
}
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
// TODO: once StopFilter.makeStopSet(List) method is available, switch
// to using that so we can avoid a toArray() call
commonWords.addAll(CommonGramsFilter.makeCommonSet((String[]) wlist
.toArray(new String[0]), ignoreCase));
}
commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
} catch (IOException e) {
throw new RuntimeException(e);
}
@ -73,7 +66,7 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
return ignoreCase;
}
public Set getCommonWords() {
public Set<?> getCommonWords() {
return commonWords;
}
@ -81,7 +74,7 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
* Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter
*/
public CommonGramsQueryFilter create(TokenStream input) {
CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords,
CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords,
ignoreCase);
CommonGramsQueryFilter commonGramsQuery = new CommonGramsQueryFilter(
commonGrams);

View File

@ -20,7 +20,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cz.CzechStemFilter;
/** Factory for CzechStemFilter */
/** Factory for {@link CzechStemFilter} */
public class CzechStemFilterFactory extends BaseTokenFilterFactory {
public TokenStream create(TokenStream input) {
return new CzechStemFilter(input);

View File

@ -31,7 +31,7 @@ import java.util.Map;
/**
*
*
* Factory for {@link DelimitedPayloadTokenFilter}
**/
public class DelimitedPayloadTokenFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
public static final String ENCODER_ATTR = "encoder";

View File

@ -18,20 +18,18 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.compound.*;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.SolrException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import java.util.List;
import java.util.Set;
import java.util.Map;
import java.io.IOException;
/** Factory for {@link DictionaryCompoundWordTokenFilter} */
public class DictionaryCompoundWordTokenFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
private Set dictionary;
private CharArraySet dictionary;
private String dictFile;
private int minWordSize;
private int minSubwordSize;
@ -39,6 +37,7 @@ public class DictionaryCompoundWordTokenFilterFactory extends BaseTokenFilterFac
private boolean onlyLongestMatch;
public void init(Map<String, String> args) {
super.init(args);
assureMatchVersion();
dictFile = args.get("dictionary");
if (null == dictFile) {
throw new SolrException( SolrException.ErrorCode.SERVER_ERROR,
@ -52,14 +51,13 @@ public class DictionaryCompoundWordTokenFilterFactory extends BaseTokenFilterFac
}
public void inform(ResourceLoader loader) {
try {
List<String> wlist = loader.getLines(dictFile);
dictionary = StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), false);
dictionary = super.getWordSet(loader, dictFile, false);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
public DictionaryCompoundWordTokenFilter create(TokenStream input) {
return new DictionaryCompoundWordTokenFilter(input,dictionary,minWordSize,minSubwordSize,maxSubwordSize,onlyLongestMatch);
return new DictionaryCompoundWordTokenFilter(luceneMatchVersion,input,dictionary,minWordSize,minSubwordSize,maxSubwordSize,onlyLongestMatch);
}
}

View File

@ -20,11 +20,9 @@ import java.io.IOException;
import java.util.LinkedList;
import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
public class DoubleMetaphoneFilter extends TokenFilter {
@ -41,8 +39,8 @@ public class DoubleMetaphoneFilter extends TokenFilter {
super(input);
this.encoder.setMaxCodeLen(maxCodeLength);
this.inject = inject;
this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
this.termAtt = addAttribute(TermAttribute.class);
this.posAtt = addAttribute(PositionIncrementAttribute.class);
}
@Override

View File

@ -18,19 +18,19 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.nl.*;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import java.util.Map;
import java.util.Map;
/**
* @deprecated Use {@link SnowballPorterFilterFactory} with "Dutch" instead,
* which has the same functionality.
*/
@Deprecated
public class DutchStemFilterFactory extends BaseTokenFilterFactory {
public DutchStemFilter create(TokenStream _in) {
return new DutchStemFilter(_in);
public TokenFilter create(TokenStream _in) {
return new SnowballFilter(_in, new org.tartarus.snowball.ext.DutchStemmer());
}
}

View File

@ -21,32 +21,22 @@ package org.apache.solr.analysis;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.fr.*;
import java.io.IOException;
import java.util.Set;
import java.util.HashSet;
import java.util.Arrays;
import java.util.Iterator;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilter;
import java.util.Map;
import java.util.List;
import java.util.Set;
import java.io.IOException;
/** Factory for {@link ElisionFilter} */
public class ElisionFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
private Set articles;
private CharArraySet articles;
public void inform(ResourceLoader loader) {
String articlesFile = args.get("articles");
if (articlesFile != null) {
try {
List<String> wlist = loader.getLines(articlesFile);
articles = StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), false);
articles = getWordSet(loader, articlesFile, false);
} catch (IOException e) {
throw new RuntimeException(e);
}

View File

@ -18,17 +18,14 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.tartarus.snowball.SnowballProgram;
import java.io.IOException;
import java.io.File;
import java.util.List;
/**
* @version $Id$
@ -42,21 +39,7 @@ public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implement
String wordFiles = args.get(PROTECTED_TOKENS);
if (wordFiles != null) {
try {
File protectedWordFiles = new File(wordFiles);
if (protectedWordFiles.exists()) {
List<String> wlist = loader.getLines(wordFiles);
//This cast is safe in Lucene
protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
} else {
List<String> files = StrUtils.splitFileNames(wordFiles);
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
if (protectedWords == null)
protectedWords = new CharArraySet(wlist, false);
else
protectedWords.addAll(wlist);
}
}
protectedWords = getWordSet(loader, wordFiles, false);
} catch (IOException e) {
throw new RuntimeException(e);
}
@ -65,20 +48,10 @@ public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implement
private CharArraySet protectedWords = null;
public EnglishPorterFilter create(TokenStream input) {
return new EnglishPorterFilter(input, protectedWords);
public TokenFilter create(TokenStream input) {
if (protectedWords != null)
input = new KeywordMarkerTokenFilter(input, protectedWords);
return new SnowballFilter(input, new org.tartarus.snowball.ext.EnglishStemmer());
}
}
/**
* English Porter2 filter that doesn't use reflection to
* adapt lucene to the snowball stemmer code.
*/
@Deprecated
class EnglishPorterFilter extends SnowballPorterFilter {
public EnglishPorterFilter(TokenStream source, CharArraySet protWords) {
super(source, new org.tartarus.snowball.ext.EnglishStemmer(), protWords);
}
}

View File

@ -18,18 +18,19 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.fr.*;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
import java.util.Hashtable;
import java.util.HashSet;
import java.util.Set;
import java.util.Map;
/**
* @deprecated Use {@link SnowballPorterFilterFactory} with "French" instead,
* which has the same functionality.
*/
@Deprecated
public class FrenchStemFilterFactory extends BaseTokenFilterFactory {
public FrenchStemFilter create(TokenStream in) {
return new FrenchStemFilter(in);
public TokenFilter create(TokenStream in) {
return new SnowballFilter(in, new org.tartarus.snowball.ext.FrenchStemmer());
}
}

View File

@ -18,13 +18,11 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.de.*;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.de.GermanStemFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
import java.util.Set;
import java.util.Map;
/** Factory for {@link GermanStemFilter} */
public class GermanStemFilterFactory extends BaseTokenFilterFactory {
public GermanStemFilter create(TokenStream in) {
return new GermanStemFilter(in);

View File

@ -26,6 +26,7 @@ import org.apache.lucene.analysis.el.GreekLowerCaseFilter;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
/** Factory for {@link GreekLowerCaseFilter} */
public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory
{

View File

@ -20,7 +20,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.hi.HindiNormalizationFilter;
/** Factory for HindiNormalizationFilter */
/** Factory for {@link HindiNormalizationFilter} */
public class HindiNormalizationFilterFactory extends BaseTokenFilterFactory {
public TokenStream create(TokenStream input) {
return new HindiNormalizationFilter(input);

View File

@ -20,7 +20,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.hi.HindiStemFilter;
/** Factory for HindiStemFilter */
/** Factory for {@link HindiStemFilter} */
public class HindiStemFilterFactory extends BaseTokenFilterFactory {
public TokenStream create(TokenStream input) {
return new HindiStemFilter(input);

View File

@ -54,8 +54,8 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
*/
public final class HyphenatedWordsFilter extends TokenFilter {
private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
private final TermAttribute termAttribute = addAttribute(TermAttribute.class);
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private final StringBuilder hyphenated = new StringBuilder();
private State savedState;

View File

@ -21,7 +21,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.BaseTokenFilterFactory;
/**
* Factory for HyphenatedWordsFilter
* Factory for {@link HyphenatedWordsFilter}
*/
public class HyphenatedWordsFilterFactory extends BaseTokenFilterFactory {
public HyphenatedWordsFilter create(TokenStream input) {

View File

@ -21,8 +21,10 @@ import org.apache.lucene.analysis.ISOLatin1AccentFilter;
import org.apache.lucene.analysis.TokenStream;
/** Factory for ISOLatin1AccentFilter
* @deprecated Use {@link ASCIIFoldingFilterFactory} instead.
* $Id$
*/
@Deprecated
public class ISOLatin1AccentFilterFactory extends BaseTokenFilterFactory {
public ISOLatin1AccentFilter create(TokenStream input) {
return new ISOLatin1AccentFilter(input);

View File

@ -20,7 +20,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
/** Factory for IndicNormalizationFilter */
/** Factory for {@link IndicNormalizationFilter} */
public class IndicNormalizationFilterFactory extends BaseTokenFilterFactory {
public TokenStream create(TokenStream input) {
return new IndicNormalizationFilter(input);

View File

@ -22,7 +22,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.in.IndicTokenizer;
/** Factory for IndicTokenizer */
/** Factory for {@link IndicTokenizer} */
public class IndicTokenizerFactory extends BaseTokenizerFactory {
public Tokenizer create(Reader input) {
assureMatchVersion();

View File

@ -19,10 +19,8 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.tartarus.snowball.SnowballProgram;
import java.io.IOException;
import java.util.Set;
@ -38,6 +36,8 @@ public final class KeepWordFilter extends TokenFilter {
private final CharArraySet words;
private final TermAttribute termAtt;
/** @deprecated Use {@link #KeepWordFilter(TokenStream, Set, boolean)} instead */
@Deprecated
public KeepWordFilter(TokenStream in, Set<String> words, boolean ignoreCase ) {
this(in, new CharArraySet(words, ignoreCase));
}
@ -47,7 +47,7 @@ public final class KeepWordFilter extends TokenFilter {
public KeepWordFilter(TokenStream in, CharArraySet words) {
super(in);
this.words = words;
this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
this.termAtt = addAttribute(TermAttribute.class);
}
@Override

View File

@ -18,17 +18,11 @@
package org.apache.solr.analysis;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.CharArraySet;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.io.File;
import java.io.File;
import java.io.IOException;
/**
@ -40,23 +34,13 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements Res
private CharArraySet words;
private boolean ignoreCase;
@SuppressWarnings("unchecked")
public void inform(ResourceLoader loader) {
String wordFiles = args.get("words");
ignoreCase = getBoolean("ignoreCase", false);
if (wordFiles != null) {
if (wordFiles != null) {
try {
List<String> files = StrUtils.splitFileNames(wordFiles);
if (words == null && files.size() > 0){
words = new CharArraySet(files.size() * 10, ignoreCase);
}
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
//TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call
words.addAll(StopFilter.makeStopSet((String[]) wlist.toArray(new String[0]), ignoreCase));
}
}
catch (IOException e) {
words = getWordSet(loader, wordFiles, ignoreCase);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
@ -67,14 +51,14 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements Res
* NOTE: if ignoreCase==true, the words are expected to be lowercase
*/
public void setWords(Set<String> words) {
this.words = new CharArraySet(words, ignoreCase);
this.words = new CharArraySet(luceneMatchVersion, words, ignoreCase);
}
public void setIgnoreCase(boolean ignoreCase) {
this.ignoreCase = ignoreCase;
if (words != null) {
words = new CharArraySet(words, ignoreCase);
public void setIgnoreCase(boolean ignoreCase) {
if (words != null && this.ignoreCase != ignoreCase) {
words = new CharArraySet(luceneMatchVersion, words, ignoreCase);
}
this.ignoreCase = ignoreCase;
}
public KeepWordFilter create(TokenStream input) {

View File

@ -0,0 +1,55 @@
package org.apache.solr.analysis;
import java.io.IOException;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.util.plugin.ResourceLoaderAware;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Factory for {@link KeywordMarkerTokenFilter}
*/
public class KeywordMarkerFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
public static final String PROTECTED_TOKENS = "protected";
private CharArraySet protectedWords;
private boolean ignoreCase;
public void inform(ResourceLoader loader) {
String wordFiles = args.get(PROTECTED_TOKENS);
ignoreCase = getBoolean("ignoreCase", false);
if (wordFiles != null) {
try {
protectedWords = getWordSet(loader, wordFiles, ignoreCase);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
public boolean isIgnoreCase() {
return ignoreCase;
}
public TokenStream create(TokenStream input) {
return protectedWords == null ? input : new KeywordMarkerTokenFilter(input, protectedWords);
}
}

View File

@ -17,7 +17,6 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.KeywordTokenizer;
import java.io.Reader;

View File

@ -17,17 +17,23 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.LetterTokenizer;
import java.io.Reader;
import java.util.Map;
/**
* @version $Id$
*/
public class LetterTokenizerFactory extends BaseTokenizerFactory {
public LetterTokenizer create(Reader input) {
@Override
public void init(Map<String,String> args) {
super.init(args);
assureMatchVersion();
}
public LetterTokenizer create(Reader input) {
return new LetterTokenizer(luceneMatchVersion, input);
}
}

View File

@ -17,6 +17,8 @@
package org.apache.solr.analysis;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.LowerCaseFilter;
@ -24,8 +26,13 @@ import org.apache.lucene.analysis.LowerCaseFilter;
* @version $Id$
*/
public class LowerCaseFilterFactory extends BaseTokenFilterFactory {
public LowerCaseFilter create(TokenStream input) {
@Override
public void init(Map<String,String> args) {
super.init(args);
assureMatchVersion();
}
public LowerCaseFilter create(TokenStream input) {
return new LowerCaseFilter(luceneMatchVersion,input);
}
}

View File

@ -17,17 +17,22 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import java.io.Reader;
import java.util.Map;
/**
* @version $Id$
*/
public class LowerCaseTokenizerFactory extends BaseTokenizerFactory {
public LowerCaseTokenizer create(Reader input) {
@Override
public void init(Map<String,String> args) {
super.init(args);
assureMatchVersion();
}
public LowerCaseTokenizer create(Reader input) {
return new LowerCaseTokenizer(luceneMatchVersion,input);
}
}

View File

@ -18,13 +18,12 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.payloads.*;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.payloads.NumericPayloadTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Payload;
import java.io.IOException;
import java.util.Map;
/** Factory for {@link NumericPayloadTokenFilter} */
public class NumericPayloadTokenFilterFactory extends BaseTokenFilterFactory {
private float payload;
private String typeMatch;

View File

@ -19,13 +19,10 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.Set;
import java.io.IOException;
import java.nio.CharBuffer;
@ -66,7 +63,7 @@ public final class PatternReplaceFilter extends TokenFilter {
this.p=p;
this.replacement = (null == replacement) ? "" : replacement;
this.all=all;
this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
this.termAtt = addAttribute(TermAttribute.class);
}
@Override

View File

@ -56,8 +56,8 @@ import org.apache.commons.io.IOUtils;
*/
public final class PatternTokenizer extends Tokenizer {
private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
private final TermAttribute termAtt = addAttribute(TermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private String str;
private int index;

View File

@ -18,12 +18,11 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.fa.*;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.util.Map;
/** Factory for {@link PersianNormalizationFilter} */
public class PersianNormalizationFilterFactory extends BaseTokenFilterFactory {
public PersianNormalizationFilter create(TokenStream input) {
return new PersianNormalizationFilter(input);

View File

@ -20,7 +20,6 @@ package org.apache.solr.analysis;
import org.apache.commons.codec.Encoder;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@ -47,8 +46,8 @@ public class PhoneticFilter extends TokenFilter
this.encoder = encoder;
this.name = name;
this.inject = inject;
this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
this.termAtt = addAttribute(TermAttribute.class);
this.posAtt = addAttribute(PositionIncrementAttribute.class);
}
@Override

View File

@ -17,11 +17,12 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.solr.util.CharArrayMap;
import org.apache.lucene.util.Version;
import java.io.IOException;
@ -30,12 +31,11 @@ import java.io.IOException;
*/
public final class RemoveDuplicatesTokenFilter extends TokenFilter {
private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
private final TermAttribute termAttribute = addAttribute(TermAttribute.class);
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
// keep a seen 'set' after each term with posInc > 0
// for now use CharArrayMap vs CharArraySet, as it has clear()
private final CharArrayMap<Boolean> previous = new CharArrayMap<Boolean>(8, false);
// use a fixed version, as we don't care about case sensitivity.
private final CharArraySet previous = new CharArraySet(Version.LUCENE_31, 8, false);
/**
* Creates a new RemoveDuplicatesTokenFilter
@ -60,12 +60,12 @@ public final class RemoveDuplicatesTokenFilter extends TokenFilter {
previous.clear();
}
boolean duplicate = (posIncrement == 0 && previous.get(term, 0, length) != null);
boolean duplicate = (posIncrement == 0 && previous.contains(term, 0, length));
// clone the term, and add to the set of seen terms.
char saved[] = new char[length];
System.arraycopy(term, 0, saved, 0, length);
previous.put(saved, Boolean.TRUE);
previous.add(saved);
if (!duplicate) {
return true;

View File

@ -45,8 +45,8 @@ public class ReversedWildcardFilter extends TokenFilter {
protected ReversedWildcardFilter(TokenStream input, boolean withOriginal, char markerChar) {
super(input);
this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
this.termAtt = addAttribute(TermAttribute.class);
this.posAtt = addAttribute(PositionIncrementAttribute.class);
this.withOriginal = withOriginal;
this.markerChar = markerChar;
}

View File

@ -1,61 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
//package org.apache.solr.analysis;
//import org.apache.lucene.analysis.ru.*;
//import java.util.Map;
//import java.util.HashMap;
//import org.apache.solr.core.SolrConfig;
//import org.apache.solr.common.SolrException;
//import org.apache.solr.common.SolrException.ErrorCode;
//import org.slf4j.Logger;
//import org.slf4j.LoggerFactory;
//
//@Deprecated
//public class RussianCommon {
//
// private static Logger logger = LoggerFactory.getLogger(RussianCommon.class);
//
// private static Map<String,char[]> CHARSETS = new HashMap<String,char[]>();
// static {
// CHARSETS.put("UnicodeRussian",RussianCharsets.UnicodeRussian);
// CHARSETS.put("KOI8",RussianCharsets.KOI8);
// CHARSETS.put("CP1251",RussianCharsets.CP1251);
// }
//
// public static char[] getCharset(String name) {
// if (null == name)
// return RussianCharsets.UnicodeRussian;
//
// char[] charset = CHARSETS.get(name);
//
// if (charset.equals(RussianCharsets.UnicodeRussian))
// logger.warn("Specifying UnicodeRussian is no longer required (default). "
// + "Use of the charset parameter will cause an error in Solr 1.5");
// else
// logger.warn("Support for this custom encoding is deprecated. "
// + "Use of the charset parameter will cause an error in Solr 1.5");
//
// if (null == charset) {
// throw new SolrException(ErrorCode.SERVER_ERROR,
// "Don't understand charset: " + name);
// }
// return charset;
// }
//}

View File

@ -24,6 +24,10 @@ import org.apache.lucene.analysis.ru.RussianLetterTokenizer;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
/** @deprecated Use {@link StandardTokenizerFactory} instead.
* This tokenizer has no Russian-specific functionality.
*/
@Deprecated
public class RussianLetterTokenizerFactory extends BaseTokenizerFactory {
@Override

View File

@ -19,11 +19,17 @@ package org.apache.solr.analysis;
import java.util.Map;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ru.RussianLowerCaseFilter;
import org.apache.lucene.util.Version;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
/** @deprecated Use {@link LowerCaseFilterFactory} instead which has the
* same functionality.
*/
@Deprecated
public class RussianLowerCaseFilterFactory extends BaseTokenFilterFactory {
@Override
@ -35,8 +41,9 @@ public class RussianLowerCaseFilterFactory extends BaseTokenFilterFactory {
+ "Please process your documents as Unicode instead.");
}
public RussianLowerCaseFilter create(TokenStream in) {
return new RussianLowerCaseFilter(in);
public TokenFilter create(TokenStream in) {
// hardcode the version to give exactly the old behavior
return new LowerCaseFilter(Version.LUCENE_29, in);
}
}

View File

@ -19,16 +19,19 @@
package org.apache.solr.analysis;
import java.util.Map;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ru.RussianStemFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
/**
* @deprecated Use {@link SnowballPorterFilterFactory} with "Russian" instead,
* which has the same functionality.
*/
@Deprecated
public class RussianStemFilterFactory extends BaseTokenFilterFactory {
public RussianStemFilter create(TokenStream in) {
return new RussianStemFilter(in);
public TokenFilter create(TokenStream in) {
return new SnowballFilter(in, new org.tartarus.snowball.ext.RussianStemmer());
}
}

View File

@ -18,14 +18,12 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.shingle.*;
import java.io.IOException;
import java.util.LinkedList;
import java.util.Iterator;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import java.util.Map;
/** Factory for {@link ShingleFilter} */
public class ShingleFilterFactory extends BaseTokenFilterFactory {
private int maxShingleSize;
private boolean outputUnigrams;

View File

@ -17,26 +17,21 @@
package org.apache.solr.analysis;
import java.util.Map;
import java.util.List;
import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.tartarus.snowball.SnowballProgram;
/**
* Factory for SnowballFilters, with configurable language
*
* Browsing the code, SnowballFilter uses reflection to adapt to Lucene... don't
* use this if you are concerned about speed. Use EnglishPorterFilterFactory.
* Factory for {@link SnowballFilter}, with configurable language
* <p>
* Note: Use of the "Lovins" stemmer is not recommended, as it is implemented with reflection.
*
* @version $Id$
*/
@ -44,28 +39,14 @@ public class SnowballPorterFilterFactory extends BaseTokenFilterFactory implemen
public static final String PROTECTED_TOKENS = "protected";
private String language = "English";
private Class stemClass;
private Class<?> stemClass;
public void inform(ResourceLoader loader) {
String wordFiles = args.get(PROTECTED_TOKENS);
if (wordFiles != null) {
try {
File protectedWordFiles = new File(wordFiles);
if (protectedWordFiles.exists()) {
List<String> wlist = loader.getLines(wordFiles);
//This cast is safe in Lucene
protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
} else {
List<String> files = StrUtils.splitFileNames(wordFiles);
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
if (protectedWords == null)
protectedWords = new CharArraySet(wlist, false);
else
protectedWords.addAll(wlist);
}
}
protectedWords = getWordSet(loader, wordFiles, false);
} catch (IOException e) {
throw new RuntimeException(e);
}
@ -87,50 +68,17 @@ public class SnowballPorterFilterFactory extends BaseTokenFilterFactory implemen
}
}
public SnowballPorterFilter create(TokenStream input) {
public TokenFilter create(TokenStream input) {
SnowballProgram program;
try {
program = (SnowballProgram)stemClass.newInstance();
} catch (Exception e) {
throw new RuntimeException("Error instantiating stemmer for language " + language + "from class " +stemClass, e);
}
return new SnowballPorterFilter(input, program, protectedWords);
if (protectedWords != null)
input = new KeywordMarkerTokenFilter(input, protectedWords);
return new SnowballFilter(input, program);
}
}
class SnowballPorterFilter extends TokenFilter {
private final CharArraySet protWords;
private final SnowballProgram stemmer;
private final TermAttribute termAtt;
public SnowballPorterFilter(TokenStream source, SnowballProgram stemmer, CharArraySet protWords) {
super(source);
this.protWords = protWords;
this.stemmer = stemmer;
this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
}
@Override
public boolean incrementToken() throws IOException {
if (!input.incrementToken()) return false;
char[] termBuffer = termAtt.termBuffer();
int len = termAtt.termLength();
// if protected, don't stem. use this to avoid stemming collisions.
if (protWords != null && protWords.contains(termBuffer, 0, len)) {
return true;
}
stemmer.setCurrent(termBuffer, len);
stemmer.stem();
final char finalTerm[] = stemmer.getCurrentBuffer();
final int newLength = stemmer.getCurrentBufferLength();
if (finalTerm != termBuffer)
termAtt.setTermBuffer(finalTerm, 0, newLength);
else
termAtt.setTermLength(newLength);
return true;
}
}

View File

@ -17,18 +17,23 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.Reader;
import java.util.Map;
/**
* @version $Id$
*/
public class StandardTokenizerFactory extends BaseTokenizerFactory {
public StandardTokenizer create(Reader input) {
@Override
public void init(Map<String,String> args) {
super.init(args);
assureMatchVersion();
}
public StandardTokenizer create(Reader input) {
return new StandardTokenizer(luceneMatchVersion, input);
}
}

View File

@ -0,0 +1,68 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.List;
import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
/**
* Factory for {@link StemmerOverrideFilter}
*/
public class StemmerOverrideFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
private CharArrayMap<String> dictionary = null;
private boolean ignoreCase;
public void inform(ResourceLoader loader) {
String dictionaryFiles = args.get("dictionary");
ignoreCase = getBoolean("ignoreCase", false);
if (dictionaryFiles != null) {
assureMatchVersion();
List<String> files = StrUtils.splitFileNames(dictionaryFiles);
try {
if (files.size() > 0) {
dictionary = new CharArrayMap<String>(luceneMatchVersion,
files.size() * 10, ignoreCase);
for (String file : files) {
List<String> list = loader.getLines(file.trim());
for (String line : list) {
String[] mapping = line.split("\t", 2);
dictionary.put(mapping[0], mapping[1]);
}
}
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
public boolean isIgnoreCase() {
return ignoreCase;
}
public TokenStream create(TokenStream input) {
return dictionary == null ? input : new StemmerOverrideFilter(luceneMatchVersion, input, dictionary);
}
}

View File

@ -18,18 +18,14 @@
package org.apache.solr.analysis;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.CharArraySet;
import java.util.HashSet;
import java.util.List;
import java.io.File;
import java.util.Map;
import java.util.Set;
import java.io.File;
import java.io.IOException;
/**
@ -37,6 +33,12 @@ import java.io.IOException;
*/
public class StopFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
@Override
public void init(Map<String,String> args) {
super.init(args);
assureMatchVersion();
}
public void inform(ResourceLoader loader) {
String stopWordFiles = args.get("words");
ignoreCase = getBoolean("ignoreCase",false);
@ -44,20 +46,12 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc
if (stopWordFiles != null) {
try {
List<String> files = StrUtils.splitFileNames(stopWordFiles);
if (stopWords == null && files.size() > 0){
//default stopwords list has 35 or so words, but maybe don't make it that big to start
stopWords = new CharArraySet(files.size() * 10, ignoreCase);
}
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
stopWords.addAll(StopFilter.makeStopSet(wlist, ignoreCase));
}
stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
} catch (IOException e) {
throw new RuntimeException(e);
}
} else {
stopWords = new CharArraySet(StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
stopWords = new CharArraySet(luceneMatchVersion, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
}
}
@ -78,7 +72,6 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc
}
public StopFilter create(TokenStream input) {
assureMatchVersion();
StopFilter stopFilter = new StopFilter(luceneMatchVersion,input,stopWords,ignoreCase);
stopFilter.setEnablePositionIncrements(enablePositionIncrements);
return stopFilter;

View File

@ -24,7 +24,6 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;
import java.io.IOException;
@ -50,7 +49,7 @@ public class SynonymFilter extends TokenFilter {
public SynonymFilter(TokenStream in, SynonymMap map) {
super(in);
this.map = map;
// just ensuring these exist attributes exist...
// just ensuring these attributes exist...
addAttribute(TermAttribute.class);
addAttribute(PositionIncrementAttribute.class);
addAttribute(OffsetAttribute.class);
@ -88,7 +87,7 @@ public class SynonymFilter extends TokenFilter {
// common case fast-path of first token not matching anything
AttributeSource firstTok = nextTok();
if (firstTok == null) return false;
TermAttribute termAtt = (TermAttribute) firstTok.addAttribute(TermAttribute.class);
TermAttribute termAtt = firstTok.addAttribute(TermAttribute.class);
SynonymMap result = map.submap!=null ? map.submap.get(termAtt.termBuffer(), 0, termAtt.termLength()) : null;
if (result == null) {
copy(this, firstTok);
@ -121,7 +120,7 @@ public class SynonymFilter extends TokenFilter {
boolean includeOrig = result.includeOrig();
AttributeSource origTok = includeOrig ? firstTok : null;
PositionIncrementAttribute firstPosIncAtt = (PositionIncrementAttribute) firstTok.addAttribute(PositionIncrementAttribute.class);
PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
int repPos=0; // curr position in replacement token stream
int pos=0; // current position in merged token stream
@ -129,12 +128,11 @@ public class SynonymFilter extends TokenFilter {
for (int i=0; i<result.synonyms.length; i++) {
Token repTok = result.synonyms[i];
AttributeSource newTok = firstTok.cloneAttributes();
TermAttribute newTermAtt = (TermAttribute) newTok.addAttribute(TermAttribute.class);
OffsetAttribute newOffsetAtt = (OffsetAttribute) newTok.addAttribute(OffsetAttribute.class);
TypeAttribute newTypeAtt = (TypeAttribute) newTok.addAttribute(TypeAttribute.class);
PositionIncrementAttribute newPosIncAtt = (PositionIncrementAttribute) newTok.addAttribute(PositionIncrementAttribute.class);
TermAttribute newTermAtt = newTok.addAttribute(TermAttribute.class);
OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute lastOffsetAtt = (OffsetAttribute) lastTok.addAttribute(OffsetAttribute.class);
OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);
newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
newTermAtt.setTermBuffer(repTok.termBuffer(), 0, repTok.termLength());
@ -143,13 +141,13 @@ public class SynonymFilter extends TokenFilter {
// if necessary, insert original tokens and adjust position increment
while (origTok != null && origPos <= repPos) {
PositionIncrementAttribute origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPosInc.setPositionIncrement(origPos-pos);
generated.add(origTok);
pos += origPosInc.getPositionIncrement();
origTok = matched.isEmpty() ? null : matched.removeFirst();
if (origTok != null) {
origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPos += origPosInc.getPositionIncrement();
}
}
@ -161,13 +159,13 @@ public class SynonymFilter extends TokenFilter {
// finish up any leftover original tokens
while (origTok!=null) {
PositionIncrementAttribute origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPosInc.setPositionIncrement(origPos-pos);
generated.add(origTok);
pos += origPosInc.getPositionIncrement();
origTok = matched.isEmpty() ? null : matched.removeFirst();
if (origTok != null) {
origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPos += origPosInc.getPositionIncrement();
}
}
@ -217,7 +215,7 @@ public class SynonymFilter extends TokenFilter {
if (tok == this)
tok = cloneAttributes();
// check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level?
TermAttribute termAtt = (TermAttribute) tok.getAttribute(TermAttribute.class);
TermAttribute termAtt = tok.getAttribute(TermAttribute.class);
SynonymMap subMap = map.submap.get(termAtt.termBuffer(), 0, termAtt.termLength());
if (subMap != null) {
@ -243,12 +241,8 @@ public class SynonymFilter extends TokenFilter {
}
private void copy(AttributeSource target, AttributeSource source) {
if (target == source)
return;
for (Iterator<AttributeImpl> sourceIt = source.getAttributeImplsIterator(), targetIt=target.getAttributeImplsIterator();
sourceIt.hasNext();) {
sourceIt.next().copyTo(targetIt.next());
}
if (target != source)
source.copyTo(target);
}
@Override

View File

@ -17,7 +17,6 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.solr.common.ResourceLoader;
@ -136,7 +135,7 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso
TokenStream ts = loadTokenizer(tokFactory, reader);
List<String> tokList = new ArrayList<String>();
try {
TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
while (ts.incrementToken()){
String text = new String(termAtt.termBuffer(), 0, termAtt.termLength());
if( text.length() > 0 )

View File

@ -17,8 +17,9 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.analysis.Token;
import org.apache.solr.util.CharArrayMap;
import org.apache.lucene.util.Version;
import java.util.*;
@ -52,7 +53,9 @@ public class SynonymMap {
SynonymMap currMap = this;
for (String str : singleMatch) {
if (currMap.submap==null) {
currMap.submap = new CharArrayMap<SynonymMap>(1, ignoreCase());
// for now hardcode at 2.9, as it's what the old code did.
// would be nice to fix, but shouldn't store a version in each submap!!!
currMap.submap = new CharArrayMap<SynonymMap>(Version.LUCENE_29, 1, ignoreCase());
}
SynonymMap map = currMap.submap.get(str);
@ -68,7 +71,7 @@ public class SynonymMap {
if (currMap.synonyms != null && !mergeExisting) {
throw new RuntimeException("SynonymFilter: there is already a mapping for " + singleMatch);
}
List superset = currMap.synonyms==null ? replacement :
List<Token> superset = currMap.synonyms==null ? replacement :
mergeTokens(Arrays.asList(currMap.synonyms), replacement);
currMap.synonyms = (Token[])superset.toArray(new Token[superset.size()]);
if (includeOrig) currMap.flags |= INCLUDE_ORIG;

View File

@ -18,15 +18,11 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.th.*;
import java.io.IOException;
import java.util.Locale;
import java.lang.Character.UnicodeBlock;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.th.ThaiWordFilter;
import org.apache.lucene.analysis.TokenStream;
import java.text.BreakIterator;
import java.util.Map;
/** Factory for {@link ThaiWordFilter} */
public class ThaiWordFilterFactory extends BaseTokenFilterFactory {
public ThaiWordFilter create(TokenStream input) {
return new ThaiWordFilter(input);

View File

@ -18,13 +18,11 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.payloads.*;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.payloads.TokenOffsetPayloadTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Payload;
import java.io.IOException;
import java.util.Map;
/** Factory for {@link TokenOffsetPayloadTokenFilter} */
public class TokenOffsetPayloadTokenFilterFactory extends BaseTokenFilterFactory {
public TokenOffsetPayloadTokenFilter create(TokenStream input) {
return new TokenOffsetPayloadTokenFilter(input);

View File

@ -23,7 +23,6 @@ import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.Tokenizer;
import java.io.Reader;
import java.io.IOException;
/**
* @version $Id$

View File

@ -19,7 +19,6 @@ package org.apache.solr.analysis;
import java.io.*;
import java.util.Map;
import org.apache.solr.core.SolrConfig;
import org.apache.lucene.analysis.*;

View File

@ -16,7 +16,6 @@
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.NumericTokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.solr.common.SolrException;

View File

@ -17,7 +17,6 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@ -41,8 +40,8 @@ public final class TrimFilter extends TokenFilter {
super(in);
this.updateOffsets = updateOffsets;
this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
this.termAtt = addAttribute(TermAttribute.class);
this.offsetAtt = addAttribute(OffsetAttribute.class);
}
@Override

View File

@ -20,7 +20,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;
/** Factory for TurkishLowerCaseFilter */
/** Factory for {@link TurkishLowerCaseFilter} */
public class TurkishLowerCaseFilterFactory extends BaseTokenFilterFactory {
public TokenStream create(TokenStream input) {
return new TurkishLowerCaseFilter(input);

View File

@ -18,13 +18,11 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.payloads.*;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.index.Payload;
import java.io.IOException;
import java.util.Map;
/** Factory for {@link TypeAsPayloadTokenFilter} */
public class TypeAsPayloadTokenFilterFactory extends BaseTokenFilterFactory {
public TypeAsPayloadTokenFilter create(TokenStream input) {
return new TypeAsPayloadTokenFilter(input);

View File

@ -17,17 +17,22 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import java.io.Reader;
import java.util.Map;
/**
* @version $Id$
*/
public class WhitespaceTokenizerFactory extends BaseTokenizerFactory {
public WhitespaceTokenizer create(Reader input) {
@Override
public void init(Map<String,String> args) {
super.init(args);
assureMatchVersion();
}
public WhitespaceTokenizer create(Reader input) {
return new WhitespaceTokenizer(luceneMatchVersion,input);
}
}
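A minimal sketch of the new requirement: luceneMatchVersion must be supplied to init() before create() is used, mirroring the test setup elsewhere in this change (literal strings are illustrative only):

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.WhitespaceTokenizer;

// ... inside a test or main method:
Map<String,String> args = new HashMap<String,String>();
args.put("luceneMatchVersion", "LUCENE_CURRENT"); // init() now calls assureMatchVersion() and fails without it
WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory();
factory.init(args);
WhitespaceTokenizer tokenizer = factory.create(new StringReader("split on whitespace"));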

View File

@ -120,10 +120,10 @@ final class WordDelimiterFilter extends TokenFilter {
*/
final CharArraySet protWords;
private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class);
private final TermAttribute termAttribute = addAttribute(TermAttribute.class);
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
// used for iterating word delimiter breaks
private final WordDelimiterIterator iterator;

View File

@ -21,12 +21,8 @@ import org.apache.lucene.analysis.CharArraySet;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import java.util.Map;
import java.io.File;
import java.util.List;
import java.io.IOException;
@ -40,21 +36,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implement
String wordFiles = args.get(PROTECTED_TOKENS);
if (wordFiles != null) {
try {
File protectedWordFiles = new File(wordFiles);
if (protectedWordFiles.exists()) {
List<String> wlist = loader.getLines(wordFiles);
//This cast is safe in Lucene
protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
} else {
List<String> files = StrUtils.splitFileNames(wordFiles);
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
if (protectedWords == null)
protectedWords = new CharArraySet(wlist, false);
else
protectedWords.addAll(wlist);
}
}
protectedWords = getWordSet(loader, wordFiles, false);
} catch (IOException e) {
throw new RuntimeException(e);
}

View File

@ -1,411 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.util;
import java.util.*;
import java.io.Serializable;
/**
* A simple class that stores key Strings as char[]'s in a
* hash table. Note that this is not a general purpose
* class. For example, it cannot remove items from the
* map, nor does it resize its hash table to be smaller,
* etc. It is designed to be quick to retrieve items
* by char[] keys without the necessity of converting
* to a String first.
*/
public class CharArrayMap<V> extends AbstractMap<String, V>
implements Map<String, V>, Cloneable, Serializable
{
private final static int INIT_SIZE = 2;
private char[][] keys;
private Object[] values;
private int count;
private final boolean ignoreCase;
/** Create map with enough capacity to hold startSize
* terms */
public CharArrayMap(int initialCapacity, boolean ignoreCase) {
this.ignoreCase = ignoreCase;
int size = INIT_SIZE;
// load factor of .75, inverse is 1.25, or x+x/4
initialCapacity = initialCapacity + (initialCapacity >>2);
while(size <= initialCapacity)
size <<= 1;
keys = new char[size][];
values = new Object[size];
}
public boolean ignoreCase() {
return ignoreCase;
}
public V get(char[] key) {
return get(key, 0, key.length);
}
public V get(char[] key, int off, int len) {
return (V)values[getSlot(key, off, len)];
}
public V get(CharSequence key) {
return (V)values[getSlot(key)];
}
@Override
public V get(Object key) {
return (V)values[getSlot(key)];
}
@Override
public boolean containsKey(Object s) {
return keys[getSlot(s)] != null;
}
@Override
public boolean containsValue(Object value) {
if (value == null) {
// search for key with a null value
for (int i=0; i<keys.length; i++) {
if (keys[i] != null && values[i] == null) return true;
}
return false;
}
for (int i=0; i<values.length; i++) {
Object val = values[i];
if (val != null && value.equals(val)) return true;
}
return false;
}
private int getSlot(Object key) {
if (key instanceof char[]) {
char[] keyc = (char[])key;
return getSlot(keyc, 0, keyc.length);
}
return getSlot((CharSequence)key);
}
private int getSlot(char[] key, int off, int len) {
int code = getHashCode(key, len);
int pos = code & (keys.length-1);
char[] key2 = keys[pos];
if (key2 != null && !equals(key, off, len, key2)) {
final int inc = ((code>>8)+code)|1;
do {
code += inc;
pos = code & (keys.length-1);
key2 = keys[pos];
} while (key2 != null && !equals(key, off, len, key2));
}
return pos;
}
/** Returns the slot for the given CharSequence key */
private int getSlot(CharSequence key) {
int code = getHashCode(key);
int pos = code & (keys.length-1);
char[] key2 = keys[pos];
if (key2 != null && !equals(key, key2)) {
final int inc = ((code>>8)+code)|1;
do {
code += inc;
pos = code & (keys.length-1);
key2 = keys[pos];
} while (key2 != null && !equals(key, key2));
}
return pos;
}
public V put(CharSequence key, V val) {
return put(key.toString(), val); // could be more efficient
}
@Override
public V put(String key, V val) {
return put(key.toCharArray(), val);
}
/** Add this key,val pair to the map.
* The char[] key is directly used, no copy is made.
* If ignoreCase is true for this Map, the key array will be directly modified.
* The user should never modify the key after calling this method.
*/
public V put(char[] key, Object val) {
if (ignoreCase)
for(int i=0;i< key.length;i++)
key[i] = Character.toLowerCase(key[i]);
int slot = getSlot(key, 0, key.length);
if (keys[slot] == null) count++;
Object prev = values[slot];
keys[slot] = key;
values[slot] = val;
if (count + (count>>2) >= keys.length) {
rehash();
}
return (V)prev;
}
private boolean equals(char[] text1, int off, int len, char[] text2) {
if (len != text2.length)
return false;
if (ignoreCase) {
for(int i=0;i<len;i++) {
if (Character.toLowerCase(text1[off+i]) != text2[i])
return false;
}
} else {
for(int i=0;i<len;i++) {
if (text1[off+i] != text2[i])
return false;
}
}
return true;
}
private boolean equals(CharSequence text1, char[] text2) {
int len = text1.length();
if (len != text2.length)
return false;
if (ignoreCase) {
for(int i=0;i<len;i++) {
if (Character.toLowerCase(text1.charAt(i)) != text2[i])
return false;
}
} else {
for(int i=0;i<len;i++) {
if (text1.charAt(i) != text2[i])
return false;
}
}
return true;
}
private void rehash() {
final int newSize = 2* keys.length;
char[][] oldEntries = keys;
Object[] oldValues = values;
keys = new char[newSize][];
values = new Object[newSize];
for(int i=0;i<oldEntries.length;i++) {
char[] key = oldEntries[i];
if (key != null) {
// todo: could be faster... no need to compare keys on collision
// since they are unique
int newSlot = getSlot(key,0,key.length);
keys[newSlot] = key;
values[newSlot] = oldValues[i];
}
}
}
private int getHashCode(char[] text, int len) {
int code = 0;
if (ignoreCase) {
for (int i=0; i<len; i++) {
code = code*31 + Character.toLowerCase(text[i]);
}
} else {
for (int i=0; i<len; i++) {
code = code*31 + text[i];
}
}
return code;
}
private int getHashCode(CharSequence text) {
int code;
if (ignoreCase) {
code = 0;
int len = text.length();
for (int i=0; i<len; i++) {
code = code*31 + Character.toLowerCase(text.charAt(i));
}
} else {
if (false && text instanceof String) {
code = text.hashCode();
} else {
code = 0;
int len = text.length();
for (int i=0; i<len; i++) {
code = code*31 + text.charAt(i);
}
}
}
return code;
}
@Override
public int size() {
return count;
}
@Override
public boolean isEmpty() {
return count==0;
}
@Override
public void clear() {
count = 0;
Arrays.fill(keys,null);
Arrays.fill(values,null);
}
@Override
public Set<Entry<String, V>> entrySet() {
return new EntrySet();
}
/** Returns an EntryIterator over this Map. */
public EntryIterator iterator() {
return new EntryIterator();
}
/** public iterator class so efficient methods are exposed to users */
public class EntryIterator implements Iterator<Map.Entry<String,V>> {
int pos=-1;
int lastPos;
EntryIterator() {
goNext();
}
private void goNext() {
lastPos = pos;
pos++;
while (pos < keys.length && keys[pos] == null) pos++;
}
public boolean hasNext() {
return pos < keys.length;
}
/** gets the next key... do not modify the returned char[] */
public char[] nextKey() {
goNext();
return keys[lastPos];
}
/** gets the next key as a newly created String object */
public String nextKeyString() {
return new String(nextKey());
}
/** returns the value associated with the last key returned */
public V currentValue() {
return (V)values[lastPos];
}
/** sets the value associated with the last key returned */
public V setValue(V value) {
V old = (V)values[lastPos];
values[lastPos] = value;
return old;
}
/** Returns an Entry<String,V> object created on the fly...
* use nextKey() + currentValue() for better efficiency. */
public Map.Entry<String,V> next() {
goNext();
return new MapEntry(lastPos);
}
public void remove() {
throw new UnsupportedOperationException();
}
}
private class MapEntry implements Map.Entry<String,V> {
final int pos;
MapEntry(int pos) {
this.pos = pos;
}
public char[] getCharArr() {
return keys[pos];
}
public String getKey() {
return new String(getCharArr());
}
public V getValue() {
return (V)values[pos];
}
public V setValue(V value) {
V old = (V)values[pos];
values[pos] = value;
return old;
}
public String toString() {
return getKey() + '=' + getValue();
}
}
private class EntrySet extends AbstractSet<Map.Entry<String, V>> {
public EntryIterator iterator() {
return new EntryIterator();
}
public boolean contains(Object o) {
if (!(o instanceof Map.Entry))
return false;
Map.Entry e = (Map.Entry)o;
Object key = e.getKey();
if (key==null) return false; // we don't support null keys
Object val = e.getValue();
Object v = get(key);
return v==null ? val==null : v.equals(val);
}
public boolean remove(Object o) {
throw new UnsupportedOperationException();
}
public int size() {
return count;
}
public void clear() {
CharArrayMap.this.clear();
}
}
@Override
public Object clone() {
CharArrayMap<V> map = null;
try {
map = (CharArrayMap<V>)super.clone();
map.keys = keys.clone();
map.values = values.clone();
} catch (CloneNotSupportedException e) {
// impossible
}
return map;
}
}

View File

@ -21,13 +21,18 @@ import java.util.Collections;
import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;
import org.apache.solr.core.Config;
/**
* General token testing helper functions
*/
public abstract class BaseTokenTestCase extends BaseTokenStreamTestCase
{
/** a map containing the default test version param for easy testing */
protected static final Map<String,String> DEFAULT_VERSION_PARAM =
Collections.singletonMap("luceneMatchVersion", System.getProperty("tests.luceneMatchVersion", "LUCENE_CURRENT"));
/** The default test version for easy testing */
public static final Version DEFAULT_VERSION = Config.parseLuceneVersionString(DEFAULT_VERSION_PARAM.get("luceneMatchVersion"));
}

View File

@ -39,12 +39,12 @@ public class CommonGramsFilterFactoryTest extends BaseTokenTestCase {
ResourceLoader loader = new SolrResourceLoader(null, null);
assertTrue("loader is null and it shouldn't be", loader != null);
CommonGramsFilterFactory factory = new CommonGramsFilterFactory();
Map<String, String> args = new HashMap<String, String>();
Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
args.put("words", "stop-1.txt");
args.put("ignoreCase", "true");
factory.init(args);
factory.inform(loader);
Set words = factory.getCommonWords();
Set<?> words = factory.getCommonWords();
assertTrue("words is null and it shouldn't be", words != null);
assertTrue("words Size: " + words.size() + " is not: " + 2,
words.size() == 2);
@ -71,13 +71,13 @@ public class CommonGramsFilterFactoryTest extends BaseTokenTestCase {
ResourceLoader loader = new SolrResourceLoader(null, null);
assertTrue("loader is null and it shouldn't be", loader != null);
CommonGramsFilterFactory factory = new CommonGramsFilterFactory();
Map<String, String> args = new HashMap<String, String>();
Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
factory.init(args);
factory.inform(loader);
Set words = factory.getCommonWords();
Set<?> words = factory.getCommonWords();
assertTrue("words is null and it shouldn't be", words != null);
assertTrue(words.contains("the"));
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("testing the factory"));
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("testing the factory"));
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream,
new String[] { "testing", "testing_the", "the", "the_factory", "factory" });

View File

@ -35,10 +35,10 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
public void testReset() throws Exception {
final String input = "How the s a brown s cow d like A B thing?";
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TermAttribute term = (TermAttribute) cgf.addAttribute(TermAttribute.class);
TermAttribute term = cgf.addAttribute(TermAttribute.class);
assertTrue(cgf.incrementToken());
assertEquals("How", term.term());
assertTrue(cgf.incrementToken());
@ -56,11 +56,11 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
public void testQueryReset() throws Exception {
final String input = "How the s a brown s cow d like A B thing?";
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);
TermAttribute term = (TermAttribute) wt.addAttribute(TermAttribute.class);
TermAttribute term = wt.addAttribute(TermAttribute.class);
assertTrue(nsf.incrementToken());
assertEquals("How_the", term.term());
assertTrue(nsf.incrementToken());
@ -88,7 +88,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
@Override
public TokenStream tokenStream(String field, Reader in) {
return new CommonGramsQueryFilter(new CommonGramsFilter(
new WhitespaceTokenizer(in), commonWords));
new WhitespaceTokenizer(DEFAULT_VERSION, in), commonWords));
}
};
@ -157,7 +157,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
@Override
public TokenStream tokenStream(String field, Reader in) {
return new CommonGramsFilter(
new WhitespaceTokenizer(in), commonWords);
new WhitespaceTokenizer(DEFAULT_VERSION, in), commonWords);
}
};
@ -243,7 +243,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
*/
public void testCaseSensitive() throws Exception {
final String input = "How The s a brown s cow d like A B thing?";
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
Set common = CommonGramsFilter.makeCommonSet(commonWords);
TokenFilter cgf = new CommonGramsFilter(wt, common, false);
assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s",
@ -256,7 +256,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
*/
public void testLastWordisStopWord() throws Exception {
final String input = "dog the";
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "dog_the" });
@ -267,7 +267,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
*/
public void testFirstWordisStopWord() throws Exception {
final String input = "the dog";
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "the_dog" });
@ -278,7 +278,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
*/
public void testOneWordQueryStopWord() throws Exception {
final String input = "the";
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "the" });
@ -289,7 +289,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
*/
public void testOneWordQuery() throws Exception {
final String input = "monster";
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "monster" });
@ -300,7 +300,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
*/
public void TestFirstAndLastStopWord() throws Exception {
final String input = "the of";
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "the_of" });

View File

@ -38,12 +38,12 @@ public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase {
ResourceLoader loader = new SolrResourceLoader(null, null);
assertTrue("loader is null and it shouldn't be", loader != null);
CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory();
Map<String, String> args = new HashMap<String, String>();
Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
args.put("words", "stop-1.txt");
args.put("ignoreCase", "true");
factory.init(args);
factory.inform(loader);
Set words = factory.getCommonWords();
Set<?> words = factory.getCommonWords();
assertTrue("words is null and it shouldn't be", words != null);
assertTrue("words Size: " + words.size() + " is not: " + 2,
words.size() == 2);
@ -70,13 +70,13 @@ public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase {
ResourceLoader loader = new SolrResourceLoader(null, null);
assertTrue("loader is null and it shouldn't be", loader != null);
CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory();
Map<String, String> args = new HashMap<String, String>();
Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
factory.init(args);
factory.inform(loader);
Set words = factory.getCommonWords();
Set<?> words = factory.getCommonWords();
assertTrue("words is null and it shouldn't be", words != null);
assertTrue(words.contains("the"));
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("testing the factory"));
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("testing the factory"));
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream,
new String[] { "testing_the", "the_factory" });

View File

@ -29,7 +29,7 @@ public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase {
public void testDefaults() throws Exception {
DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
factory.init(new HashMap<String, String>());
TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
TokenStream filteredStream = factory.create(inputStream);
assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
@ -43,7 +43,7 @@ public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase {
parameters.put("maxCodeLength", "8");
factory.init(parameters);
TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
TokenStream filteredStream = factory.create(inputStream);
assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
@ -56,10 +56,10 @@ public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase {
public void testReset() throws Exception {
DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
factory.init(new HashMap<String, String>());
TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
TokenStream filteredStream = factory.create(inputStream);
TermAttribute termAtt = (TermAttribute) filteredStream.addAttribute(TermAttribute.class);
TermAttribute termAtt = filteredStream.addAttribute(TermAttribute.class);
assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
assertTrue(filteredStream.incrementToken());

View File

@ -24,42 +24,42 @@ import org.apache.lucene.analysis.WhitespaceTokenizer;
public class DoubleMetaphoneFilterTest extends BaseTokenTestCase {
public void testSize4FalseInject() throws Exception {
TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
assertTokenStreamContents(filter, new String[] { "ANTR" });
}
public void testSize4TrueInject() throws Exception {
TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
assertTokenStreamContents(filter, new String[] { "international", "ANTR" });
}
public void testAlternateInjectFalse() throws Exception {
TokenStream stream = new WhitespaceTokenizer(new StringReader("Kuczewski"));
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Kuczewski"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
assertTokenStreamContents(filter, new String[] { "KSSK", "KXFS" });
}
public void testSize8FalseInject() throws Exception {
TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
assertTokenStreamContents(filter, new String[] { "ANTRNXNL" });
}
public void testNonConvertableStringsWithInject() throws Exception {
TokenStream stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%&"));
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
}
public void testNonConvertableStringsWithoutInject() throws Exception {
TokenStream stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%&"));
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
// should have something after the stream
stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%& hello"));
stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%& hello"));
filter = new DoubleMetaphoneFilter(stream, 8, false);
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" });
}

View File

@ -46,11 +46,11 @@ public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {
}
EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
Map<String, String> args = new HashMap<String, String>();
Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
factory.init(args);
factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
Tokenizer tokenizer = new WhitespaceTokenizer(
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION,
new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, gold);
@ -71,13 +71,13 @@ public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {
}
EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
Map<String, String> args = new HashMap<String, String>();
Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
args.put(EnglishPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt");
factory.init(args);
List<String> lines = new ArrayList<String>();
Collections.addAll(lines, "banks", "fledgling");
factory.inform(new LinesMockSolrResourceLoader(lines));
Tokenizer tokenizer = new WhitespaceTokenizer(
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION,
new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, gold);

View File

@ -33,7 +33,7 @@ public class LengthFilterTest extends BaseTokenTestCase {
args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10));
factory.init(args);
String test = "foo foobar super-duper-trooper";
TokenStream stream = factory.create(new WhitespaceTokenizer(new StringReader(test)));
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(test)));
assertTokenStreamContents(stream, new String[] { "foobar" });
}
}

View File

@ -48,12 +48,12 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
}
SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
Map<String, String> args = new HashMap<String, String>();
Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
args.put("language", "English");
factory.init(args);
factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
Tokenizer tokenizer = new WhitespaceTokenizer(
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION,
new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, gold);
@ -78,13 +78,13 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
}
EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
Map<String, String> args = new HashMap<String, String>();
Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
args.put(SnowballPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt");
factory.init(args);
List<String> lines = new ArrayList<String>();
Collections.addAll(lines, "banks", "fledgling");
factory.inform(new LinesMockSolrResourceLoader(lines));
Tokenizer tokenizer = new WhitespaceTokenizer(
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION,
new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, gold);
@ -116,13 +116,13 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
public void testProtected() throws Exception {
SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
ResourceLoader loader = new SolrResourceLoader(null, null);
Map<String,String> args = new HashMap<String,String>();
Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
args.put("protected", "protwords.txt");
args.put("language", "English");
factory.init(args);
factory.inform(loader);
Reader reader = new StringReader("ridding of some stemming");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "ridding", "of", "some", "stem" });
}

View File

@ -33,7 +33,7 @@ public class TestBrazilianStemFilterFactory extends BaseTokenTestCase {
*/
public void testStemming() throws Exception {
Reader reader = new StringReader("Brasília");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
BrazilianStemFilterFactory factory = new BrazilianStemFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "brasil" });

View File

@ -59,7 +59,7 @@ public class TestBufferedTokenStream extends BaseTokenTestCase {
final String input = "How now A B brown A cow B like A B thing?";
final String expected = "How now Q B brown A cow B like Q B thing?";
TokenStream ts = new AB_Q_Stream
(new WhitespaceTokenizer(new StringReader(input)));
(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
assertTokenStreamContents(ts, expected.split("\\s"));
}
@ -67,15 +67,15 @@ public class TestBufferedTokenStream extends BaseTokenTestCase {
final String input = "How now A B brown A cow B like A B thing?";
final String expected = "How now A A B brown A cow B like A A B thing?";
TokenStream ts = new AB_AAB_Stream
(new WhitespaceTokenizer(new StringReader(input)));
(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
assertTokenStreamContents(ts, expected.split("\\s"));
}
public void testReset() throws Exception {
final String input = "How now A B brown A cow B like A B thing?";
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(input));
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
TokenStream ts = new AB_AAB_Stream(tokenizer);
TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
TermAttribute term = ts.addAttribute(TermAttribute.class);
assertTrue(ts.incrementToken());
assertEquals("How", term.term());
assertTrue(ts.incrementToken());

View File

@ -33,7 +33,7 @@ public class TestBulgarianStemFilterFactory extends BaseTokenTestCase {
*/
public void testStemming() throws Exception {
Reader reader = new StringReader("компютри");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
BulgarianStemFilterFactory factory = new BulgarianStemFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "компютр" });

View File

@ -34,7 +34,7 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
public void testCapitalization() throws Exception
{
Map<String,String> args = new HashMap<String, String>();
Map<String,String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
args.put( CapitalizationFilterFactory.KEEP, "and the it BIG" );
args.put( CapitalizationFilterFactory.ONLY_FIRST_WORD, "true" );
@ -74,18 +74,18 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
// now each token
factory.onlyFirstWord = false;
tokenizer = new WhitespaceTokenizer(new StringReader("Hello thEre my Name is Ryan"));
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" });
// now only the long words
factory.minWordLength = 3;
tokenizer = new WhitespaceTokenizer(new StringReader("Hello thEre my Name is Ryan" ));
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan" ));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Hello", "There", "my", "Name", "is", "Ryan" });
// without prefix
tokenizer = new WhitespaceTokenizer(new StringReader("McKinley" ));
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" ));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Mckinley" });
@ -93,14 +93,14 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
factory = new CapitalizationFilterFactory();
args.put( "okPrefix", "McK" ); // all words
factory.init( args );
tokenizer = new WhitespaceTokenizer(new StringReader("McKinley" ));
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" ));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "McKinley" });
// now try some stuff with numbers
factory.forceFirstLetter = false;
factory.onlyFirstWord = false;
tokenizer = new WhitespaceTokenizer(new StringReader("1st 2nd third" ));
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("1st 2nd third" ));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "1st", "2nd", "Third" });
@ -111,7 +111,7 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
}
public void testKeepIgnoreCase() throws Exception {
Map<String,String> args = new HashMap<String, String>();
Map<String,String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
args.put( CapitalizationFilterFactory.KEEP, "kitten" );
args.put( CapitalizationFilterFactory.KEEP_IGNORE_CASE, "true" );
args.put( CapitalizationFilterFactory.ONLY_FIRST_WORD, "true" );
@ -141,12 +141,12 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
* This is very weird when combined with ONLY_FIRST_WORD!!!
*/
public void testMinWordLength() throws Exception {
Map<String,String> args = new HashMap<String,String>();
Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
args.put(CapitalizationFilterFactory.ONLY_FIRST_WORD, "true");
args.put(CapitalizationFilterFactory.MIN_WORD_LENGTH, "5");
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
factory.init(args);
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(
"helo testing"));
TokenStream ts = factory.create(tokenizer);
assertTokenStreamContents(ts, new String[] {"helo", "Testing"});
@ -157,11 +157,11 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
* in each token (it should do nothing)
*/
public void testMaxWordCount() throws Exception {
Map<String,String> args = new HashMap<String,String>();
Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2");
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
factory.init(args);
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(
"one two three four"));
TokenStream ts = factory.create(tokenizer);
assertTokenStreamContents(ts, new String[] {"One", "Two", "Three", "Four"});
@ -171,7 +171,7 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
* Test CapitalizationFilterFactory's maxWordCount option when exceeded
*/
public void testMaxWordCount2() throws Exception {
Map<String,String> args = new HashMap<String,String>();
Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2");
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
factory.init(args);
@ -187,11 +187,11 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
* This is weird, it is not really a max, but inclusive (look at 'is')
*/
public void testMaxTokenLength() throws Exception {
Map<String,String> args = new HashMap<String,String>();
Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
args.put(CapitalizationFilterFactory.MAX_TOKEN_LENGTH, "2");
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
factory.init(args);
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(
"this is a test"));
TokenStream ts = factory.create(tokenizer);
assertTokenStreamContents(ts, new String[] {"this", "is", "A", "test"});
@ -201,12 +201,12 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
* Test CapitalizationFilterFactory's forceFirstLetter option
*/
public void testForceFirstLetter() throws Exception {
Map<String,String> args = new HashMap<String,String>();
Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
args.put(CapitalizationFilterFactory.KEEP, "kitten");
args.put(CapitalizationFilterFactory.FORCE_FIRST_LETTER, "true");
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
factory.init(args);
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("kitten"));
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("kitten"));
TokenStream ts = factory.create(tokenizer);
assertTokenStreamContents(ts, new String[] {"Kitten"});
}

View File

@ -33,7 +33,7 @@ public class TestChineseFilterFactory extends BaseTokenTestCase {
*/
public void testFiltering() throws Exception {
Reader reader = new StringReader("this 1234 Is such a silly filter");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
ChineseFilterFactory factory = new ChineseFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Is", "silly", "filter" });

View File

@ -177,9 +177,9 @@ public class TestCollationKeyFilterFactory extends BaseTokenTestCase {
private void assertCollatesToSame(TokenStream stream1, TokenStream stream2)
throws IOException {
TermAttribute term1 = (TermAttribute) stream1
TermAttribute term1 = stream1
.addAttribute(TermAttribute.class);
TermAttribute term2 = (TermAttribute) stream2
TermAttribute term2 = stream2
.addAttribute(TermAttribute.class);
assertTrue(stream1.incrementToken());
assertTrue(stream2.incrementToken());

View File

@ -33,7 +33,7 @@ public class TestCzechStemFilterFactory extends BaseTokenTestCase {
*/
public void testStemming() throws Exception {
Reader reader = new StringReader("angličtí");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
CzechStemFilterFactory factory = new CzechStemFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "anglick" });

View File

@ -21,8 +21,6 @@ import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import junit.framework.TestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
@ -32,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.core.SolrResourceLoader;
public class TestDelimitedPayloadTokenFilterFactory extends TestCase {
public class TestDelimitedPayloadTokenFilterFactory extends BaseTokenTestCase {
public void testEncoder() throws Exception {
Map<String,String> args = new HashMap<String, String>();
@ -42,10 +40,10 @@ public class TestDelimitedPayloadTokenFilterFactory extends TestCase {
ResourceLoader loader = new SolrResourceLoader(null, null);
factory.inform(loader);
TokenStream input = new WhitespaceTokenizer(new StringReader("the|0.1 quick|0.1 red|0.1"));
TokenStream input = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("the|0.1 quick|0.1 red|0.1"));
DelimitedPayloadTokenFilter tf = factory.create(input);
while (tf.incrementToken()){
PayloadAttribute payAttr = (PayloadAttribute) tf.getAttribute(PayloadAttribute.class);
PayloadAttribute payAttr = tf.getAttribute(PayloadAttribute.class);
assertTrue("payAttr is null and it shouldn't be", payAttr != null);
byte[] payData = payAttr.getPayload().getData();
assertTrue("payData is null and it shouldn't be", payData != null);
@ -64,10 +62,10 @@ public class TestDelimitedPayloadTokenFilterFactory extends TestCase {
ResourceLoader loader = new SolrResourceLoader(null, null);
factory.inform(loader);
TokenStream input = new WhitespaceTokenizer(new StringReader("the*0.1 quick*0.1 red*0.1"));
TokenStream input = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("the*0.1 quick*0.1 red*0.1"));
DelimitedPayloadTokenFilter tf = factory.create(input);
while (tf.incrementToken()){
PayloadAttribute payAttr = (PayloadAttribute) tf.getAttribute(PayloadAttribute.class);
PayloadAttribute payAttr = tf.getAttribute(PayloadAttribute.class);
assertTrue("payAttr is null and it shouldn't be", payAttr != null);
byte[] payData = payAttr.getPayload().getData();
assertTrue("payData is null and it shouldn't be", payData != null);

View File

@ -37,10 +37,10 @@ public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenTestC
*/
public void testDecompounding() throws Exception {
Reader reader = new StringReader("I like to play softball");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
DictionaryCompoundWordTokenFilterFactory factory = new DictionaryCompoundWordTokenFilterFactory();
ResourceLoader loader = new SolrResourceLoader(null, null);
Map<String,String> args = new HashMap<String,String>();
Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
args.put("dictionary", "compoundDictionary.txt");
factory.init(args);
factory.inform(loader);

View File

@ -33,7 +33,7 @@ public class TestDutchStemFilterFactory extends BaseTokenTestCase {
*/
public void testStemming() throws Exception {
Reader reader = new StringReader("lichamelijkheden");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
DutchStemFilterFactory factory = new DutchStemFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "licham" });

View File

@ -37,7 +37,7 @@ public class TestElisionFilterFactory extends BaseTokenTestCase {
*/
public void testElision() throws Exception {
Reader reader = new StringReader("l'avion");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
ElisionFilterFactory factory = new ElisionFilterFactory();
factory.init(DEFAULT_VERSION_PARAM);
ResourceLoader loader = new SolrResourceLoader(null, null);

View File

@ -33,7 +33,7 @@ public class TestFrenchStemFilterFactory extends BaseTokenTestCase {
*/
public void testStemming() throws Exception {
Reader reader = new StringReader("habitable");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
FrenchStemFilterFactory factory = new FrenchStemFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "habit" });

View File

@ -33,7 +33,7 @@ public class TestGermanStemFilterFactory extends BaseTokenTestCase {
*/
public void testStemming() throws Exception {
Reader reader = new StringReader("Tischen");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
GermanStemFilterFactory factory = new GermanStemFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "tisch" });

View File

@ -33,7 +33,7 @@ public class TestGreekLowerCaseFilterFactory extends BaseTokenTestCase {
*/
public void testStemming() throws Exception {
Reader reader = new StringReader("Μάϊος ΜΆΪΟΣ");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
GreekLowerCaseFilterFactory factory = new GreekLowerCaseFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "μαιοσ", "μαιοσ" });

View File

@@ -29,7 +29,7 @@ public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
public void testHyphenatedWords() throws Exception {
String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal";
// first test
TokenStream ts = new WhitespaceTokenizer(new StringReader(input));
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory();
ts = factory.create(ts);
assertTokenStreamContents(ts,
@@ -42,7 +42,7 @@ public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
public void testHyphenAtEnd() throws Exception {
String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecology-";
// first test
TokenStream ts = new WhitespaceTokenizer(new StringReader(input));
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory();
ts = factory.create(ts);
assertTokenStreamContents(ts,

View File

@@ -23,25 +23,22 @@ import java.util.Set;
import java.util.Map;
import java.util.HashMap;
import junit.framework.TestCase;
/**
*
*
**/
public class TestKeepFilterFactory extends TestCase{
public class TestKeepFilterFactory extends BaseTokenTestCase{
public void testInform() throws Exception {
ResourceLoader loader = new SolrResourceLoader(null, null);
assertTrue("loader is null and it shouldn't be", loader != null);
KeepWordFilterFactory factory = new KeepWordFilterFactory();
Map<String, String> args = new HashMap<String, String>();
Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
args.put("words", "keep-1.txt");
args.put("ignoreCase", "true");
factory.init(args);
factory.inform(loader);
Set words = factory.getWords();
Set<?> words = factory.getWords();
assertTrue("words is null and it shouldn't be", words != null);
assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);

View File

@@ -41,7 +41,7 @@ public class TestKeepWordFilter extends BaseTokenTestCase {
words.add( "bbb" );
String input = "aaa BBB ccc ddd EEE";
Map<String,String> args = new HashMap<String, String>();
Map<String,String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
ResourceLoader loader = new SolrResourceLoader(null, null);
// Test Stopwords
@@ -51,29 +51,29 @@ public class TestKeepWordFilter extends BaseTokenTestCase {
factory.inform( loader );
factory.setWords( words );
assertTrue(factory.isIgnoreCase());
TokenStream stream = factory.create(new WhitespaceTokenizer(new StringReader(input)));
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
// Test Stopwords (ignoreCase via the setter instead)
factory = new KeepWordFilterFactory();
args = new HashMap<String, String>();
args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
factory.init( args );
factory.inform( loader );
factory.setIgnoreCase(true);
factory.setWords( words );
assertTrue(factory.isIgnoreCase());
stream = factory.create(new WhitespaceTokenizer(new StringReader(input)));
stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
// Now force case
factory = new KeepWordFilterFactory();
args = new HashMap<String, String>();
args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
args.put( "ignoreCase", "false" );
factory.init( args );
factory.inform( loader );
factory.setWords( words );
assertFalse(factory.isIgnoreCase());
stream = factory.create(new WhitespaceTokenizer(new StringReader(input)));
stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
assertTokenStreamContents(stream, new String[] { "aaa" });
}
}

View File

@@ -0,0 +1,65 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.core.SolrResourceLoader;
/**
* Simple tests to ensure the keyword marker filter factory is working.
*/
public class TestKeywordMarkerFilterFactory extends BaseTokenTestCase {
public void testKeywords() throws IOException {
Reader reader = new StringReader("dogs cats");
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
KeywordMarkerFilterFactory factory = new KeywordMarkerFilterFactory();
Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
ResourceLoader loader = new SolrResourceLoader(null, null);
args.put("protected", "protwords.txt");
factory.init(args);
factory.inform(loader);
TokenStream ts = new PorterStemFilter(factory.create(tokenizer));
assertTokenStreamContents(ts, new String[] { "dog", "cats" });
}
public void testKeywordsCaseInsensitive() throws IOException {
Reader reader = new StringReader("dogs cats Cats");
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
KeywordMarkerFilterFactory factory = new KeywordMarkerFilterFactory();
Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
ResourceLoader loader = new SolrResourceLoader(null, null);
args.put("protected", "protwords.txt");
args.put("ignoreCase", "true");
factory.init(args);
factory.inform(loader);
TokenStream ts = new PorterStemFilter(factory.create(tokenizer));
assertTokenStreamContents(ts, new String[] { "dog", "cats", "Cats" });
}
}
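For context, a hedged sketch (not part of the commit) of what this factory wires up underneath: KeywordMarkerFilter flags protected terms via KeywordAttribute, and keyword-aware stemmers such as PorterStemFilter leave flagged tokens unstemmed. The in-memory word set below is an assumption standing in for protwords.txt, and the listed imports are assumed locations for the Lucene 3.1-era classes.
// Assumed imports: org.apache.lucene.analysis.CharArraySet, KeywordMarkerFilter,
// PorterStemFilter, Tokenizer, WhitespaceTokenizer; java.io.StringReader.
CharArraySet protWords = new CharArraySet(DEFAULT_VERSION, 2, true); // ignoreCase = true
protWords.add("cats");
Tokenizer tok = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("dogs cats"));
TokenStream ts = new PorterStemFilter(new KeywordMarkerFilter(tok, protWords));
// expected tokens: "dog", "cats" -- the protected term is skipped by the stemmer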

View File

@@ -20,7 +20,7 @@ public class TestMultiWordSynonyms extends BaseTokenTestCase {
SynonymMap synMap = new SynonymMap(true);
SynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null);
SynonymFilter ts = new SynonymFilter(new WhitespaceTokenizer(new StringReader("a e")), synMap);
SynonymFilter ts = new SynonymFilter(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("a e")), synMap);
// This fails because ["e","e"] is the value of the token stream
assertTokenStreamContents(ts, new String[] { "a", "e" });
}

Some files were not shown because too many files have changed in this diff.