mirror of https://github.com/apache/lucene.git
SOLR-1857: cleanup and sync analysis with Lucene trunk
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@929782 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent a528a707c1
commit 3860c16a66

@@ -126,6 +126,14 @@ New Features

 * SOLR-1769: Solr 1.4 Replication - Repeater throwing NullPointerException (Jörgen Rydenius via noble)

+* SOLR-1857: Synced Solr analysis with Lucene 3.1. Added KeywordMarkerFilterFactory
+  and StemmerOverrideFilterFactory, which can be used to tune stemming algorithms.
+  Added factories for Bulgarian, Czech, Hindi, and Turkish analysis. Improved the
+  performance of SnowballPorterFilterFactory. (rmuir)
+
+* SOLR-1657: Converted remaining TokenStreams to the Attributes-based API. All Solr
+  TokenFilters now support custom Attributes, and some have improved performance:
+  especially WordDelimiterFilter and CommonGramsFilter. (rmuir, cmale, uschindler)

 Optimizations
 ----------------------

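A minimal sketch of what the new keyword-marking support enables, assuming only the KeywordMarkerTokenFilter and SnowballFilter signatures that appear in the hunks below; the ProtectedStemChain wrapper and the chain wiring are illustrative, not part of this commit:

    import org.apache.lucene.analysis.CharArraySet;
    import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.snowball.SnowballFilter;

    class ProtectedStemChain {
      // words marked as keywords pass through the stemmer unchanged
      private final CharArraySet protectedWords;

      ProtectedStemChain(CharArraySet protectedWords) {
        this.protectedWords = protectedWords;
      }

      TokenStream wrap(TokenStream input) {
        TokenStream marked = new KeywordMarkerTokenFilter(input, protectedWords);
        return new SnowballFilter(marked, new org.tartarus.snowball.ext.EnglishStemmer());
      }
    }
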
@@ -18,9 +18,10 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.*;
import org.apache.lucene.util.ArrayUtil;
import java.util.Map;
import org.apache.lucene.analysis.ASCIIFoldingFilter;
import org.apache.lucene.analysis.TokenStream;

/** Factory for {@link ASCIIFoldingFilter} */
public class ASCIIFoldingFilterFactory extends BaseTokenFilterFactory {
  public ASCIIFoldingFilter create(TokenStream input) {
    return new ASCIIFoldingFilter(input);

@@ -16,15 +16,13 @@
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;

import java.io.Reader;

/**
 *
 *
 * Factory for {@link ArabicLetterTokenizer}
 **/
public class ArabicLetterTokenizerFactory extends BaseTokenizerFactory{

@@ -21,8 +21,7 @@ import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;

/**
 *
 *
 * Factory for {@link ArabicNormalizationFilter}
 **/
public class ArabicNormalizationFilterFactory extends BaseTokenFilterFactory{

@@ -21,8 +21,7 @@ import org.apache.lucene.analysis.ar.ArabicStemFilter;

/**
 *
 *
 * Factory for {@link ArabicStemFilter}
 **/
public class ArabicStemFilterFactory extends BaseTokenFilterFactory{

@@ -17,13 +17,17 @@
package org.apache.solr.analysis;

import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.core.Config;
import org.apache.solr.common.SolrException;
import org.apache.solr.schema.IndexSchema;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.util.Version;

@@ -94,4 +98,22 @@ abstract class BaseTokenStreamFactory {
     return Boolean.parseBoolean(s);
   }

+  protected CharArraySet getWordSet(ResourceLoader loader,
+      String wordFiles, boolean ignoreCase) throws IOException {
+    assureMatchVersion();
+    List<String> files = StrUtils.splitFileNames(wordFiles);
+    CharArraySet words = null;
+    if (files.size() > 0) {
+      // default stopwords list has 35 or so words, but maybe don't make it that
+      // big to start
+      words = new CharArraySet(luceneMatchVersion,
+          files.size() * 10, ignoreCase);
+      for (String file : files) {
+        List<String> wlist = loader.getLines(file.trim());
+        words.addAll(StopFilter.makeStopSet(luceneMatchVersion, wlist,
+            ignoreCase));
+      }
+    }
+    return words;
+  }
 }

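The net effect of the new helper: every factory that loads a word file can now delegate to getWordSet. A hedged sketch of a hypothetical factory's inform(), mirroring the pattern the later hunks apply (MyWordListFilterFactory and the "words" argument name are illustrative):

    public class MyWordListFilterFactory extends BaseTokenFilterFactory
        implements ResourceLoaderAware {
      private CharArraySet words;

      public void inform(ResourceLoader loader) {
        String wordFiles = args.get("words");
        if (wordFiles != null) {
          try {
            // one call replaces the per-factory split/loop/makeStopSet boilerplate
            words = getWordSet(loader, wordFiles, getBoolean("ignoreCase", false));
          } catch (IOException e) {
            throw new RuntimeException(e);
          }
        }
      }
    }
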
@@ -18,15 +18,10 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.br.*;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
import java.util.Map;
import org.apache.lucene.analysis.br.BrazilianStemFilter;

/** Factory for {@link BrazilianStemFilter} */
public class BrazilianStemFilterFactory extends BaseTokenFilterFactory {
  public BrazilianStemFilter create(TokenStream in) {
    return new BrazilianStemFilter(in);

@@ -73,12 +73,12 @@ public abstract class BufferedTokenStream extends TokenFilter {
  private final LinkedList<Token> inQueue = new LinkedList<Token>();
  private final LinkedList<Token> outQueue = new LinkedList<Token>();

-  private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
-  private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
-  private final TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
-  private final FlagsAttribute flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
-  private final PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
-  private final PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

  public BufferedTokenStream(TokenStream input) {
    super(input);

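The cast removal works because AttributeSource.addAttribute is generic in Lucene 3.x, so the compiler infers the attribute type from the class literal. A minimal sketch of a filter written against that API (UpperCaseExampleFilter is illustrative, not part of this commit):

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import java.io.IOException;

    final class UpperCaseExampleFilter extends TokenFilter {
      // no cast needed: addAttribute(TermAttribute.class) returns TermAttribute
      private final TermAttribute termAtt = addAttribute(TermAttribute.class);

      UpperCaseExampleFilter(TokenStream input) { super(input); }

      @Override
      public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) return false;
        char[] buf = termAtt.termBuffer();
        for (int i = 0; i < termAtt.termLength(); i++) {
          buf[i] = Character.toUpperCase(buf[i]);
        }
        return true;
      }
    }
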
@@ -20,7 +20,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.bg.BulgarianStemFilter;

-/** Factory for BulgarianStemFilter */
+/** Factory for {@link BulgarianStemFilter} */
public class BulgarianStemFilterFactory extends BaseTokenFilterFactory {
  public TokenStream create(TokenStream input) {
    return new BulgarianStemFilter(input);

@@ -18,11 +18,11 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.cjk.*;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKTokenizer;
import java.io.Reader;
import java.util.Map;

/** Factory for {@link CJKTokenizer} */
public class CJKTokenizerFactory extends BaseTokenizerFactory {
  public CJKTokenizer create(Reader in) {
    return new CJKTokenizer(in);

@@ -75,6 +75,7 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
  @Override
  public void init(Map<String, String> args) {
    super.init(args);
+    assureMatchVersion();

    String k = args.get(KEEP);
    if (k != null) {

@@ -84,7 +85,7 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
      if ("true".equalsIgnoreCase(ignoreStr)) {
        ignoreCase = true;
      }
-      keep = new CharArraySet(10, ignoreCase);
+      keep = new CharArraySet(luceneMatchVersion, 10, ignoreCase);
      while (st.hasMoreTokens()) {
        k = st.nextToken().trim();
        keep.add(k.toCharArray());

@@ -194,7 +195,7 @@ class CapitalizationFilter extends TokenFilter {
  public CapitalizationFilter(TokenStream in, final CapitalizationFilterFactory factory) {
    super(in);
    this.factory = factory;
-    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    this.termAtt = addAttribute(TermAttribute.class);
  }

  @Override

@@ -18,10 +18,14 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.cn.*;
import java.util.Hashtable;
import org.apache.lucene.analysis.*;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.ChineseFilter;

/**
 * Factory for {@link ChineseFilter}
 * @deprecated Use {@link StopFilterFactory} instead.
 */
@Deprecated
public class ChineseFilterFactory extends BaseTokenFilterFactory {
  public ChineseFilter create(TokenStream in) {
    return new ChineseFilter(in);

@@ -18,10 +18,15 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.cn.*;

import java.io.Reader;
import org.apache.lucene.analysis.*;
import java.util.Map;
import org.apache.lucene.analysis.cn.ChineseTokenizer;

/**
 * Factory for {@link ChineseTokenizer}
 * @deprecated Use {@link StandardTokenizerFactory} instead.
 */
@Deprecated
public class ChineseTokenizerFactory extends BaseTokenizerFactory {
  public ChineseTokenizer create(Reader in) {
    return new ChineseTokenizer(in);

@@ -20,6 +20,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.Version;

/*
 * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors

@@ -51,15 +52,25 @@ public final class CommonGramsFilter extends TokenFilter {

  private final StringBuilder buffer = new StringBuilder();

-  private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
-  private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
-  private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class);
-  private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+  private final TermAttribute termAttribute = addAttribute(TermAttribute.class);
+  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
+  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);

  private int lastStartOffset;
  private boolean lastWasCommon;
  private State savedState;

+  /** @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set)} instead */
+  public CommonGramsFilter(TokenStream input, Set<?> commonWords) {
+    this(Version.LUCENE_29, input, commonWords);
+  }
+
+  /** @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set, boolean)} instead */
+  public CommonGramsFilter(TokenStream input, Set<?> commonWords, boolean ignoreCase) {
+    this(Version.LUCENE_29, input, commonWords, ignoreCase);
+  }

  /**
   * Construct a token stream filtering the given input using a Set of common
   * words to create bigrams. Outputs both unigrams with position increment and

@@ -69,8 +80,8 @@ public final class CommonGramsFilter extends TokenFilter {
   * @param input TokenStream input in filter chain
   * @param commonWords The set of common words.
   */
-  public CommonGramsFilter(TokenStream input, Set commonWords) {
-    this(input, commonWords, false);
+  public CommonGramsFilter(Version matchVersion, TokenStream input, Set<?> commonWords) {
+    this(matchVersion, input, commonWords, false);
  }

  /**

@@ -90,12 +101,12 @@ public final class CommonGramsFilter extends TokenFilter {
   * @param commonWords The set of common words.
   * @param ignoreCase -Ignore case when constructing bigrams for common words.
   */
-  public CommonGramsFilter(TokenStream input, Set commonWords, boolean ignoreCase) {
+  public CommonGramsFilter(Version matchVersion, TokenStream input, Set<?> commonWords, boolean ignoreCase) {
    super(input);
    if (commonWords instanceof CharArraySet) {
      this.commonWords = (CharArraySet) commonWords;
    } else {
-      this.commonWords = new CharArraySet(commonWords.size(), ignoreCase);
+      this.commonWords = new CharArraySet(matchVersion, commonWords.size(), ignoreCase);
      this.commonWords.addAll(commonWords);
    }
  }

@@ -106,7 +117,9 @@ public final class CommonGramsFilter extends TokenFilter {
   *
   * @param input Tokenstream in filter chain
   * @param commonWords words to be used in constructing bigrams
+   * @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set)} instead.
   */
+  @Deprecated
  public CommonGramsFilter(TokenStream input, String[] commonWords) {
    this(input, commonWords, false);
  }

@@ -118,7 +131,9 @@ public final class CommonGramsFilter extends TokenFilter {
   * @param input Tokenstream in filter chain
   * @param commonWords words to be used in constructing bigrams
   * @param ignoreCase -Ignore case when constructing bigrams for common words.
+   * @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set, boolean)} instead.
   */
+  @Deprecated
  public CommonGramsFilter(TokenStream input, String[] commonWords, boolean ignoreCase) {
    super(input);
    this.commonWords = makeCommonSet(commonWords, ignoreCase);

@@ -132,7 +147,9 @@ public final class CommonGramsFilter extends TokenFilter {
   * @param commonWords Array of common words which will be converted into the CharArraySet
   * @return CharArraySet of the given words, appropriate for passing into the CommonGramFilter constructor
   * @see #makeCommonSet(java.lang.String[], boolean) passing false to ignoreCase
+   * @deprecated create a CharArraySet with CharArraySet instead
   */
+  @Deprecated
  public static CharArraySet makeCommonSet(String[] commonWords) {
    return makeCommonSet(commonWords, false);
  }

@@ -145,7 +162,9 @@ public final class CommonGramsFilter extends TokenFilter {
   * @param commonWords Array of common words which will be converted into the CharArraySet
   * @param ignoreCase If true, all words are lower cased first.
   * @return a Set containing the words
+   * @deprecated create a CharArraySet with CharArraySet instead
   */
+  @Deprecated
  public static CharArraySet makeCommonSet(String[] commonWords, boolean ignoreCase) {
    CharArraySet commonSet = new CharArraySet(commonWords.length, ignoreCase);
    commonSet.addAll(Arrays.asList(commonWords));

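The Version-aware constructor is now the supported entry point. A hedged usage sketch (the word list and tokenizer are illustrative; the CommonGramsFilter signature is taken from the hunks above):

    import java.io.StringReader;
    import org.apache.lucene.analysis.CharArraySet;
    import org.apache.lucene.analysis.LowerCaseTokenizer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.util.Version;
    import org.apache.solr.analysis.CommonGramsFilter;

    class CommonGramsExample {
      static TokenStream build(String text) {
        CharArraySet common = new CharArraySet(Version.LUCENE_31, 2, true);
        common.add("the".toCharArray());
        common.add("of".toCharArray());
        TokenStream ts = new LowerCaseTokenizer(Version.LUCENE_31, new StringReader(text));
        // emits the unigrams plus "the_quick"-style bigrams around the common words
        return new CommonGramsFilter(Version.LUCENE_31, ts, common, true);
      }
    }
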
@@ -17,14 +17,12 @@
package org.apache.solr.analysis;

import java.io.IOException;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;

/**

@@ -43,16 +41,7 @@ public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements

    if (commonWordFiles != null) {
      try {
-        List<String> files = StrUtils.splitFileNames(commonWordFiles);
-        if (commonWords == null && files.size() > 0){
-          //default stopwords list has 35 or so words, but maybe don't make it that big to start
-          commonWords = new CharArraySet(files.size() * 10, ignoreCase);
-        }
-        for (String file : files) {
-          List<String> wlist = loader.getLines(file.trim());
-          //TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call
-          commonWords.addAll(CommonGramsFilter.makeCommonSet((String[])wlist.toArray(new String[0]), ignoreCase));
-        }
+        commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }

@@ -69,12 +58,12 @@ public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements
    return ignoreCase;
  }

-  public Set getCommonWords() {
+  public Set<?> getCommonWords() {
    return commonWords;
  }

  public CommonGramsFilter create(TokenStream input) {
-    CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords, ignoreCase);
+    CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords, ignoreCase);
    return commonGrams;
  }
}

@@ -47,8 +47,8 @@ import static org.apache.solr.analysis.CommonGramsFilter.GRAM_TYPE;
 */
public final class CommonGramsQueryFilter extends TokenFilter {

-  private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class);
-  private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
+  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);

  private State previous;
  private String previousType;

@@ -17,14 +17,13 @@
package org.apache.solr.analysis;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;

/**

@@ -36,25 +35,19 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
  implements ResourceLoaderAware {

+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
+    assureMatchVersion();
+  }
+
  public void inform(ResourceLoader loader) {
    String commonWordFiles = args.get("words");
    ignoreCase = getBoolean("ignoreCase", false);

    if (commonWordFiles != null) {
      try {
-        List<String> files = StrUtils.splitFileNames(commonWordFiles);
-        if (commonWords == null && files.size() > 0) {
-          // default stopwords list has 35 or so words, but maybe don't make it
-          // that big to start
-          commonWords = new CharArraySet(files.size() * 10, ignoreCase);
-        }
-        for (String file : files) {
-          List<String> wlist = loader.getLines(file.trim());
-          // TODO: once StopFilter.makeStopSet(List) method is available, switch
-          // to using that so we can avoid a toArray() call
-          commonWords.addAll(CommonGramsFilter.makeCommonSet((String[]) wlist
-              .toArray(new String[0]), ignoreCase));
-        }
+        commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }

@@ -73,7 +66,7 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
    return ignoreCase;
  }

-  public Set getCommonWords() {
+  public Set<?> getCommonWords() {
    return commonWords;
  }

@@ -81,7 +74,7 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
   * Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter
   */
  public CommonGramsQueryFilter create(TokenStream input) {
-    CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords,
+    CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords,
        ignoreCase);
    CommonGramsQueryFilter commonGramsQuery = new CommonGramsQueryFilter(
        commonGrams);

@@ -20,7 +20,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cz.CzechStemFilter;

-/** Factory for CzechStemFilter */
+/** Factory for {@link CzechStemFilter} */
public class CzechStemFilterFactory extends BaseTokenFilterFactory {
  public TokenStream create(TokenStream input) {
    return new CzechStemFilter(input);

@@ -31,7 +31,7 @@ import java.util.Map;

/**
 *
 *
 * Factory for {@link DelimitedPayloadTokenFilter}
 **/
public class DelimitedPayloadTokenFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
  public static final String ENCODER_ATTR = "encoder";

@@ -18,20 +18,18 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.compound.*;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.SolrException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import java.util.List;
import java.util.Set;
import java.util.Map;
import java.io.IOException;

/** Factory for {@link DictionaryCompoundWordTokenFilter} */
public class DictionaryCompoundWordTokenFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
-  private Set dictionary;
+  private CharArraySet dictionary;
  private String dictFile;
  private int minWordSize;
  private int minSubwordSize;

@@ -39,6 +37,7 @@ public class DictionaryCompoundWordTokenFilterFactory extends BaseTokenFilterFac
  private boolean onlyLongestMatch;
  public void init(Map<String, String> args) {
    super.init(args);
+    assureMatchVersion();
    dictFile = args.get("dictionary");
    if (null == dictFile) {
      throw new SolrException( SolrException.ErrorCode.SERVER_ERROR,

@@ -52,14 +51,13 @@ public class DictionaryCompoundWordTokenFilterFactory extends BaseTokenFilterFac
  }
  public void inform(ResourceLoader loader) {
    try {
-      List<String> wlist = loader.getLines(dictFile);
-      dictionary = StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), false);
+      dictionary = super.getWordSet(loader, dictFile, false);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
  public DictionaryCompoundWordTokenFilter create(TokenStream input) {
-    return new DictionaryCompoundWordTokenFilter(input,dictionary,minWordSize,minSubwordSize,maxSubwordSize,onlyLongestMatch);
+    return new DictionaryCompoundWordTokenFilter(luceneMatchVersion,input,dictionary,minWordSize,minSubwordSize,maxSubwordSize,onlyLongestMatch);
  }
}

@@ -20,11 +20,9 @@ import java.io.IOException;
import java.util.LinkedList;

import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class DoubleMetaphoneFilter extends TokenFilter {

@@ -41,8 +39,8 @@ public class DoubleMetaphoneFilter extends TokenFilter {
    super(input);
    this.encoder.setMaxCodeLen(maxCodeLength);
    this.inject = inject;
-    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
-    this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+    this.termAtt = addAttribute(TermAttribute.class);
+    this.posAtt = addAttribute(PositionIncrementAttribute.class);
  }

  @Override

@@ -18,19 +18,19 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.nl.*;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import java.util.Map;

/**
 * @deprecated Use {@link SnowballPorterFilterFactory} with "Dutch" instead,
 *  which has the same functionality.
 */
@Deprecated
public class DutchStemFilterFactory extends BaseTokenFilterFactory {
-  public DutchStemFilter create(TokenStream _in) {
-    return new DutchStemFilter(_in);
+  public TokenFilter create(TokenStream _in) {
+    return new SnowballFilter(_in, new org.tartarus.snowball.ext.DutchStemmer());
  }
}

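The deprecated language-specific stem filters are now thin wrappers over SnowballFilter. A hedged equivalence sketch (the tokenizer and input are illustrative; the SnowballFilter call is exactly the one in the hunk above):

    import java.io.StringReader;
    import org.apache.lucene.analysis.LowerCaseTokenizer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.snowball.SnowballFilter;
    import org.apache.lucene.util.Version;

    class DutchStemExample {
      static TokenStream build(String text) {
        TokenStream ts = new LowerCaseTokenizer(Version.LUCENE_31, new StringReader(text));
        // what DutchStemFilterFactory.create(ts) now returns under the hood
        return new SnowballFilter(ts, new org.tartarus.snowball.ext.DutchStemmer());
      }
    }
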
@@ -21,32 +21,22 @@ package org.apache.solr.analysis;

import org.apache.solr.common.ResourceLoader;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.fr.*;
import java.util.Set;
import java.util.HashSet;
import java.util.Arrays;
import java.util.Iterator;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilter;
import java.util.Map;
import java.util.List;
import java.io.IOException;

/** Factory for {@link ElisionFilter} */
public class ElisionFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {

-  private Set articles;
+  private CharArraySet articles;

  public void inform(ResourceLoader loader) {
    String articlesFile = args.get("articles");

    if (articlesFile != null) {
      try {
-        List<String> wlist = loader.getLines(articlesFile);
-        articles = StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), false);
+        articles = getWordSet(loader, articlesFile, false);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }

@@ -18,17 +18,14 @@
package org.apache.solr.analysis;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.tartarus.snowball.SnowballProgram;

import java.io.IOException;
import java.io.File;
import java.util.List;

/**
 * @version $Id$

@@ -42,21 +39,7 @@ public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implement
    String wordFiles = args.get(PROTECTED_TOKENS);
    if (wordFiles != null) {
      try {
-        File protectedWordFiles = new File(wordFiles);
-        if (protectedWordFiles.exists()) {
-          List<String> wlist = loader.getLines(wordFiles);
-          //This cast is safe in Lucene
-          protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
-        } else {
-          List<String> files = StrUtils.splitFileNames(wordFiles);
-          for (String file : files) {
-            List<String> wlist = loader.getLines(file.trim());
-            if (protectedWords == null)
-              protectedWords = new CharArraySet(wlist, false);
-            else
-              protectedWords.addAll(wlist);
-          }
-        }
+        protectedWords = getWordSet(loader, wordFiles, false);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }

@@ -65,20 +48,10 @@ public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implement

  private CharArraySet protectedWords = null;

-  public EnglishPorterFilter create(TokenStream input) {
-    return new EnglishPorterFilter(input, protectedWords);
+  public TokenFilter create(TokenStream input) {
+    if (protectedWords != null)
+      input = new KeywordMarkerTokenFilter(input, protectedWords);
+    return new SnowballFilter(input, new org.tartarus.snowball.ext.EnglishStemmer());
  }
}

-/**
- * English Porter2 filter that doesn't use reflection to
- * adapt lucene to the snowball stemmer code.
- */
-@Deprecated
-class EnglishPorterFilter extends SnowballPorterFilter {
-  public EnglishPorterFilter(TokenStream source, CharArraySet protWords) {
-    super(source, new org.tartarus.snowball.ext.EnglishStemmer(), protWords);
-  }
-}

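Why marking works: KeywordMarkerTokenFilter flags the token's KeywordAttribute, and the Lucene 3.1 stemmers skip tokens flagged as keywords, so no custom filter subclass is needed. A hedged end-to-end sketch (the word list and input are illustrative; all calls use signatures shown in the hunks above):

    import java.io.StringReader;
    import org.apache.lucene.analysis.CharArraySet;
    import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
    import org.apache.lucene.analysis.LowerCaseTokenizer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.snowball.SnowballFilter;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.Version;

    class KeywordMarkerDemo {
      public static void main(String[] args) throws Exception {
        CharArraySet prot = new CharArraySet(Version.LUCENE_31, 1, false);
        prot.add("jumping".toCharArray());

        TokenStream ts = new LowerCaseTokenizer(Version.LUCENE_31, new StringReader("jumping running"));
        ts = new KeywordMarkerTokenFilter(ts, prot);
        ts = new SnowballFilter(ts, new org.tartarus.snowball.ext.EnglishStemmer());

        TermAttribute term = ts.getAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term()); // expected: "jumping" preserved, "running" -> "run"
        }
      }
    }
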
@@ -18,18 +18,19 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.fr.*;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
import java.util.Hashtable;
import java.util.HashSet;
import java.util.Set;
import java.util.Map;

/**
 * @deprecated Use {@link SnowballPorterFilterFactory} with "French" instead,
 *  which has the same functionality.
 */
@Deprecated
public class FrenchStemFilterFactory extends BaseTokenFilterFactory {
-  public FrenchStemFilter create(TokenStream in) {
-    return new FrenchStemFilter(in);
+  public TokenFilter create(TokenStream in) {
+    return new SnowballFilter(in, new org.tartarus.snowball.ext.FrenchStemmer());
  }
}

@@ -18,13 +18,11 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.de.*;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.de.GermanStemFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
import java.util.Set;
import java.util.Map;

/** Factory for {@link GermanStemFilter} */
public class GermanStemFilterFactory extends BaseTokenFilterFactory {
  public GermanStemFilter create(TokenStream in) {
    return new GermanStemFilter(in);

@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.el.GreekLowerCaseFilter;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;

+/** Factory for {@link GreekLowerCaseFilter} */
public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory
{

@@ -20,7 +20,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.hi.HindiNormalizationFilter;

-/** Factory for HindiNormalizationFilter */
+/** Factory for {@link HindiNormalizationFilter} */
public class HindiNormalizationFilterFactory extends BaseTokenFilterFactory {
  public TokenStream create(TokenStream input) {
    return new HindiNormalizationFilter(input);

@@ -20,7 +20,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.hi.HindiStemFilter;

-/** Factory for HindiStemFilter */
+/** Factory for {@link HindiStemFilter} */
public class HindiStemFilterFactory extends BaseTokenFilterFactory {
  public TokenStream create(TokenStream input) {
    return new HindiStemFilter(input);

@@ -54,8 +54,8 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 */
public final class HyphenatedWordsFilter extends TokenFilter {

-  private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
-  private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+  private final TermAttribute termAttribute = addAttribute(TermAttribute.class);
+  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);

  private final StringBuilder hyphenated = new StringBuilder();
  private State savedState;

@@ -21,7 +21,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.BaseTokenFilterFactory;

/**
- * Factory for HyphenatedWordsFilter
+ * Factory for {@link HyphenatedWordsFilter}
 */
public class HyphenatedWordsFilterFactory extends BaseTokenFilterFactory {
  public HyphenatedWordsFilter create(TokenStream input) {

@@ -21,8 +21,10 @@ import org.apache.lucene.analysis.ISOLatin1AccentFilter;
import org.apache.lucene.analysis.TokenStream;

/** Factory for ISOLatin1AccentFilter
+ * @deprecated Use {@link ASCIIFoldingFilterFactory} instead.
 * $Id$
 */
+@Deprecated
public class ISOLatin1AccentFilterFactory extends BaseTokenFilterFactory {
  public ISOLatin1AccentFilter create(TokenStream input) {
    return new ISOLatin1AccentFilter(input);

@@ -20,7 +20,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;

-/** Factory for IndicNormalizationFilter */
+/** Factory for {@link IndicNormalizationFilter} */
public class IndicNormalizationFilterFactory extends BaseTokenFilterFactory {
  public TokenStream create(TokenStream input) {
    return new IndicNormalizationFilter(input);

@@ -22,7 +22,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.in.IndicTokenizer;

-/** Factory for IndicTokenizer */
+/** Factory for {@link IndicTokenizer} */
public class IndicTokenizerFactory extends BaseTokenizerFactory {
  public Tokenizer create(Reader input) {
    assureMatchVersion();

@@ -19,10 +19,8 @@ package org.apache.solr.analysis;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.tartarus.snowball.SnowballProgram;

import java.io.IOException;
import java.util.Set;

@@ -38,6 +36,8 @@ public final class KeepWordFilter extends TokenFilter {
  private final CharArraySet words;
  private final TermAttribute termAtt;

+  /** @deprecated Use {@link #KeepWordFilter(TokenStream, Set, boolean)} instead */
+  @Deprecated
  public KeepWordFilter(TokenStream in, Set<String> words, boolean ignoreCase ) {
    this(in, new CharArraySet(words, ignoreCase));
  }

@@ -47,7 +47,7 @@ public final class KeepWordFilter extends TokenFilter {
  public KeepWordFilter(TokenStream in, CharArraySet words) {
    super(in);
    this.words = words;
-    this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
+    this.termAtt = addAttribute(TermAttribute.class);
  }

  @Override

@@ -18,17 +18,11 @@
package org.apache.solr.analysis;

import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.CharArraySet;

import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.io.File;
import java.io.IOException;

/**

@@ -40,23 +34,13 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements Res
  private CharArraySet words;
  private boolean ignoreCase;

-  @SuppressWarnings("unchecked")
  public void inform(ResourceLoader loader) {
    String wordFiles = args.get("words");
    ignoreCase = getBoolean("ignoreCase", false);
    if (wordFiles != null) {
      try {
-        List<String> files = StrUtils.splitFileNames(wordFiles);
-        if (words == null && files.size() > 0){
-          words = new CharArraySet(files.size() * 10, ignoreCase);
-        }
-        for (String file : files) {
-          List<String> wlist = loader.getLines(file.trim());
-          //TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call
-          words.addAll(StopFilter.makeStopSet((String[]) wlist.toArray(new String[0]), ignoreCase));
-        }
-      }
-      catch (IOException e) {
+        words = getWordSet(loader, wordFiles, ignoreCase);
+      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

@@ -67,14 +51,14 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements Res
   * NOTE: if ignoreCase==true, the words are expected to be lowercase
   */
  public void setWords(Set<String> words) {
-    this.words = new CharArraySet(words, ignoreCase);
+    this.words = new CharArraySet(luceneMatchVersion, words, ignoreCase);
  }

-  public void setIgnoreCase(boolean ignoreCase) {
-    this.ignoreCase = ignoreCase;
-    if (words != null) {
-      words = new CharArraySet(words, ignoreCase);
-    }
+  public void setIgnoreCase(boolean ignoreCase) {
+    if (words != null && this.ignoreCase != ignoreCase) {
+      words = new CharArraySet(luceneMatchVersion, words, ignoreCase);
+    }
+    this.ignoreCase = ignoreCase;
  }

  public KeepWordFilter create(TokenStream input) {

@@ -0,0 +1,55 @@
package org.apache.solr.analysis;

import java.io.IOException;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.util.plugin.ResourceLoaderAware;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Factory for {@link KeywordMarkerTokenFilter}
 */
public class KeywordMarkerFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
  public static final String PROTECTED_TOKENS = "protected";
  private CharArraySet protectedWords;
  private boolean ignoreCase;

  public void inform(ResourceLoader loader) {
    String wordFiles = args.get(PROTECTED_TOKENS);
    ignoreCase = getBoolean("ignoreCase", false);
    if (wordFiles != null) {
      try {
        protectedWords = getWordSet(loader, wordFiles, ignoreCase);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  }

  public boolean isIgnoreCase() {
    return ignoreCase;
  }

  public TokenStream create(TokenStream input) {
    return protectedWords == null ? input : new KeywordMarkerTokenFilter(input, protectedWords);
  }
}

@@ -17,7 +17,6 @@
package org.apache.solr.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.KeywordTokenizer;

import java.io.Reader;

@@ -17,17 +17,23 @@
package org.apache.solr.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.LetterTokenizer;

import java.io.Reader;
import java.util.Map;

/**
 * @version $Id$
 */
public class LetterTokenizerFactory extends BaseTokenizerFactory {
-  public LetterTokenizer create(Reader input) {

+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
+    assureMatchVersion();
+  }
+
+  public LetterTokenizer create(Reader input) {
+    return new LetterTokenizer(luceneMatchVersion, input);
  }
}

@@ -17,6 +17,8 @@
package org.apache.solr.analysis;

+import java.util.Map;
+
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.LowerCaseFilter;

@@ -24,8 +26,13 @@ import org.apache.lucene.analysis.LowerCaseFilter;
 * @version $Id$
 */
public class LowerCaseFilterFactory extends BaseTokenFilterFactory {
-  public LowerCaseFilter create(TokenStream input) {
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
+    assureMatchVersion();
+  }
+
+  public LowerCaseFilter create(TokenStream input) {
+    return new LowerCaseFilter(luceneMatchVersion,input);
  }
}

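These init() overrides force the schema to declare luceneMatchVersion before version-sensitive analyzers are built; the factories then thread that Version into the Lucene constructors. A hedged sketch of the resulting contract (the tokenizer/filter pairing is illustrative; both constructors appear in the hunks above and below):

    import java.io.StringReader;
    import org.apache.lucene.analysis.LetterTokenizer;
    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.util.Version;

    class VersionedChain {
      // the factories pass schema.xml's luceneMatchVersion where this uses LUCENE_31;
      // the Version constant selects back-compat tokenization behavior
      static TokenStream build(String text) {
        TokenStream ts = new LetterTokenizer(Version.LUCENE_31, new StringReader(text));
        return new LowerCaseFilter(Version.LUCENE_31, ts);
      }
    }
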
@@ -17,17 +17,22 @@
package org.apache.solr.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.LowerCaseTokenizer;

import java.io.Reader;
import java.util.Map;

/**
 * @version $Id$
 */
public class LowerCaseTokenizerFactory extends BaseTokenizerFactory {
-  public LowerCaseTokenizer create(Reader input) {
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
+    assureMatchVersion();
+  }
+
+  public LowerCaseTokenizer create(Reader input) {
+    return new LowerCaseTokenizer(luceneMatchVersion,input);
  }
}

@@ -18,13 +18,12 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.payloads.*;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.payloads.NumericPayloadTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Payload;
import java.io.IOException;
import java.util.Map;

/** Factory for {@link NumericPayloadTokenFilter} */
public class NumericPayloadTokenFilterFactory extends BaseTokenFilterFactory {
  private float payload;
  private String typeMatch;

@@ -19,13 +19,10 @@ package org.apache.solr.analysis;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.Set;
import java.io.IOException;
import java.nio.CharBuffer;

@@ -66,7 +63,7 @@ public final class PatternReplaceFilter extends TokenFilter {
    this.p=p;
    this.replacement = (null == replacement) ? "" : replacement;
    this.all=all;
-    this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
+    this.termAtt = addAttribute(TermAttribute.class);
  }

  @Override

@@ -56,8 +56,8 @@ import org.apache.commons.io.IOUtils;
 */
public final class PatternTokenizer extends Tokenizer {

-  private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
-  private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  private String str;
  private int index;

@@ -18,12 +18,11 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.fa.*;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.util.Map;

/** Factory for {@link PersianNormalizationFilter} */
public class PersianNormalizationFilterFactory extends BaseTokenFilterFactory {
  public PersianNormalizationFilter create(TokenStream input) {
    return new PersianNormalizationFilter(input);

@@ -20,7 +20,6 @@ package org.apache.solr.analysis;
import org.apache.commons.codec.Encoder;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

@@ -47,8 +46,8 @@ public class PhoneticFilter extends TokenFilter
    this.encoder = encoder;
    this.name = name;
    this.inject = inject;
-    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
-    this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+    this.termAtt = addAttribute(TermAttribute.class);
+    this.posAtt = addAttribute(PositionIncrementAttribute.class);
  }

  @Override

@@ -17,11 +17,12 @@
package org.apache.solr.analysis;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.solr.util.CharArrayMap;
import org.apache.lucene.util.Version;

import java.io.IOException;

@@ -30,12 +31,11 @@ import java.io.IOException;
 */
public final class RemoveDuplicatesTokenFilter extends TokenFilter {

-  private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
-  private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+  private final TermAttribute termAttribute = addAttribute(TermAttribute.class);
+  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);

  // keep a seen 'set' after each term with posInc > 0
-  // for now use CharArrayMap vs CharArraySet, as it has clear()
-  private final CharArrayMap<Boolean> previous = new CharArrayMap<Boolean>(8, false);
+  // use a fixed version, as we don't care about case sensitivity.
+  private final CharArraySet previous = new CharArraySet(Version.LUCENE_31, 8, false);

  /**
   * Creates a new RemoveDuplicatesTokenFilter

@@ -60,12 +60,12 @@ public final class RemoveDuplicatesTokenFilter extends TokenFilter {
      previous.clear();
    }

-    boolean duplicate = (posIncrement == 0 && previous.get(term, 0, length) != null);
+    boolean duplicate = (posIncrement == 0 && previous.contains(term, 0, length));

    // clone the term, and add to the set of seen terms.
    char saved[] = new char[length];
    System.arraycopy(term, 0, saved, 0, length);
-    previous.put(saved, Boolean.TRUE);
+    previous.add(saved);

    if (!duplicate) {
      return true;

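The switch from CharArrayMap<Boolean> to CharArraySet works because the set exposes the same zero-copy primitives the filter needs: contains(char[], offset, length), add(char[]), and clear(), all of which appear in the hunks above. A hedged sketch of those semantics (the sample term is illustrative):

    import org.apache.lucene.analysis.CharArraySet;
    import org.apache.lucene.util.Version;

    class SeenTerms {
      public static void main(String[] args) {
        CharArraySet seen = new CharArraySet(Version.LUCENE_31, 8, false);
        char[] buf = "foobar".toCharArray();

        seen.add(buf);                                          // stores the chars without creating a String
        System.out.println(seen.contains(buf, 0, 3));           // false: only "foo" is checked
        System.out.println(seen.contains(buf, 0, buf.length));  // true: "foobar"
        seen.clear();                                           // reset between token positions, as the filter does
      }
    }
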
@@ -45,8 +45,8 @@ public class ReversedWildcardFilter extends TokenFilter {

  protected ReversedWildcardFilter(TokenStream input, boolean withOriginal, char markerChar) {
    super(input);
-    this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
-    this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+    this.termAtt = addAttribute(TermAttribute.class);
+    this.posAtt = addAttribute(PositionIncrementAttribute.class);
    this.withOriginal = withOriginal;
    this.markerChar = markerChar;
  }

@@ -1,61 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

//package org.apache.solr.analysis;
//import org.apache.lucene.analysis.ru.*;
//import java.util.Map;
//import java.util.HashMap;
//import org.apache.solr.core.SolrConfig;
//import org.apache.solr.common.SolrException;
//import org.apache.solr.common.SolrException.ErrorCode;
//import org.slf4j.Logger;
//import org.slf4j.LoggerFactory;
//
//@Deprecated
//public class RussianCommon {
//
//  private static Logger logger = LoggerFactory.getLogger(RussianCommon.class);
//
//  private static Map<String,char[]> CHARSETS = new HashMap<String,char[]>();
//  static {
//    CHARSETS.put("UnicodeRussian",RussianCharsets.UnicodeRussian);
//    CHARSETS.put("KOI8",RussianCharsets.KOI8);
//    CHARSETS.put("CP1251",RussianCharsets.CP1251);
//  }
//
//  public static char[] getCharset(String name) {
//    if (null == name)
//      return RussianCharsets.UnicodeRussian;
//
//    char[] charset = CHARSETS.get(name);
//
//    if (charset.equals(RussianCharsets.UnicodeRussian))
//      logger.warn("Specifying UnicodeRussian is no longer required (default). "
//        + "Use of the charset parameter will cause an error in Solr 1.5");
//    else
//      logger.warn("Support for this custom encoding is deprecated. "
//        + "Use of the charset parameter will cause an error in Solr 1.5");
//
//    if (null == charset) {
//      throw new SolrException(ErrorCode.SERVER_ERROR,
//        "Don't understand charset: " + name);
//    }
//    return charset;
//  }
//}

@@ -24,6 +24,10 @@ import org.apache.lucene.analysis.ru.RussianLetterTokenizer;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;

+/** @deprecated Use {@link StandardTokenizerFactory} instead.
+ *  This tokenizer has no Russian-specific functionality.
+ */
+@Deprecated
public class RussianLetterTokenizerFactory extends BaseTokenizerFactory {

  @Override

@@ -19,11 +19,17 @@ package org.apache.solr.analysis;

import java.util.Map;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ru.RussianLowerCaseFilter;
import org.apache.lucene.util.Version;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;

+/** @deprecated Use {@link LowerCaseFilterFactory} instead which has the
+ *  same functionality.
+ */
+@Deprecated
public class RussianLowerCaseFilterFactory extends BaseTokenFilterFactory {

  @Override

@@ -35,8 +41,9 @@ public class RussianLowerCaseFilterFactory extends BaseTokenFilterFactory {
      + "Please process your documents as Unicode instead.");
  }

-  public RussianLowerCaseFilter create(TokenStream in) {
-    return new RussianLowerCaseFilter(in);
+  public TokenFilter create(TokenStream in) {
+    // hardcode the version to give exactly the old behavior
+    return new LowerCaseFilter(Version.LUCENE_29, in);
  }
}

@@ -19,16 +19,19 @@
package org.apache.solr.analysis;

import java.util.Map;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ru.RussianStemFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;

/**
 * @deprecated Use {@link SnowballPorterFilterFactory} with "Russian" instead,
 *  which has the same functionality.
 */
@Deprecated
public class RussianStemFilterFactory extends BaseTokenFilterFactory {

-  public RussianStemFilter create(TokenStream in) {
-    return new RussianStemFilter(in);
+  public TokenFilter create(TokenStream in) {
+    return new SnowballFilter(in, new org.tartarus.snowball.ext.RussianStemmer());
  }
}

@@ -18,14 +18,12 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.shingle.*;
import java.io.IOException;
import java.util.LinkedList;
import java.util.Iterator;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import java.util.Map;

/** Factory for {@link ShingleFilter} */
public class ShingleFilterFactory extends BaseTokenFilterFactory {
  private int maxShingleSize;
  private boolean outputUnigrams;

@@ -17,26 +17,21 @@
package org.apache.solr.analysis;

import java.util.Map;
import java.util.List;
import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.tartarus.snowball.SnowballProgram;

/**
 * Factory for SnowballFilters, with configurable language
 *
 * Browsing the code, SnowballFilter uses reflection to adapt to Lucene... don't
 * use this if you are concerned about speed. Use EnglishPorterFilterFactory.
 * Factory for {@link SnowballFilter}, with configurable language
 * <p>
 * Note: Use of the "Lovins" stemmer is not recommended, as it is implemented with reflection.
 *
 * @version $Id$
 */

@@ -44,28 +39,14 @@ public class SnowballPorterFilterFactory extends BaseTokenFilterFactory implemen
  public static final String PROTECTED_TOKENS = "protected";

  private String language = "English";
  private Class stemClass;
  private Class<?> stemClass;


  public void inform(ResourceLoader loader) {
    String wordFiles = args.get(PROTECTED_TOKENS);
    if (wordFiles != null) {
      try {
        File protectedWordFiles = new File(wordFiles);
        if (protectedWordFiles.exists()) {
          List<String> wlist = loader.getLines(wordFiles);
          //This cast is safe in Lucene
          protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
        } else {
          List<String> files = StrUtils.splitFileNames(wordFiles);
          for (String file : files) {
            List<String> wlist = loader.getLines(file.trim());
            if (protectedWords == null)
              protectedWords = new CharArraySet(wlist, false);
            else
              protectedWords.addAll(wlist);
          }
        }
        protectedWords = getWordSet(loader, wordFiles, false);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }

@@ -87,50 +68,17 @@ public class SnowballPorterFilterFactory extends BaseTokenFilterFactory implemen
    }
  }

  public SnowballPorterFilter create(TokenStream input) {
  public TokenFilter create(TokenStream input) {
    SnowballProgram program;
    try {
      program = (SnowballProgram)stemClass.newInstance();
    } catch (Exception e) {
      throw new RuntimeException("Error instantiating stemmer for language " + language + "from class " +stemClass, e);
    }
    return new SnowballPorterFilter(input, program, protectedWords);

    if (protectedWords != null)
      input = new KeywordMarkerTokenFilter(input, protectedWords);
    return new SnowballFilter(input, program);
  }
}


class SnowballPorterFilter extends TokenFilter {
  private final CharArraySet protWords;
  private final SnowballProgram stemmer;
  private final TermAttribute termAtt;

  public SnowballPorterFilter(TokenStream source, SnowballProgram stemmer, CharArraySet protWords) {
    super(source);
    this.protWords = protWords;
    this.stemmer = stemmer;
    this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) return false;

    char[] termBuffer = termAtt.termBuffer();
    int len = termAtt.termLength();
    // if protected, don't stem. use this to avoid stemming collisions.
    if (protWords != null && protWords.contains(termBuffer, 0, len)) {
      return true;
    }

    stemmer.setCurrent(termBuffer, len);
    stemmer.stem();
    final char finalTerm[] = stemmer.getCurrentBuffer();
    final int newLength = stemmer.getCurrentBufferLength();
    if (finalTerm != termBuffer)
      termAtt.setTermBuffer(finalTerm, 0, newLength);
    else
      termAtt.setTermLength(newLength);

    return true;
  }
}

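The removed SnowballPorterFilter's protected-words check is replaced by composition: tokens are keyword-marked first, and the stock SnowballFilter leaves keyword-marked tokens alone. A hedged sketch of that composition outside the factory (the wrapper class, the sample word, and the English stemmer choice are illustrative):

    import java.util.Arrays;
    import org.apache.lucene.analysis.CharArraySet;
    import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.snowball.SnowballFilter;

    class KeywordMarkerSketch {
      static TokenStream stem(TokenStream input) {
        // "fledgling" stands in for a protected-words file entry.
        CharArraySet protectedWords = new CharArraySet(Arrays.asList("fledgling"), false);
        // Tokens present in the set get the keyword flag; SnowballFilter then
        // skips them, replacing the removed per-filter protWords check.
        TokenStream marked = new KeywordMarkerTokenFilter(input, protectedWords);
        return new SnowballFilter(marked, new org.tartarus.snowball.ext.EnglishStemmer());
      }
    }
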
@@ -17,18 +17,23 @@

package org.apache.solr.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;

import java.io.Reader;
import java.util.Map;

/**
 * @version $Id$
 */

public class StandardTokenizerFactory extends BaseTokenizerFactory {
  public StandardTokenizer create(Reader input) {
  @Override
  public void init(Map<String,String> args) {
    super.init(args);
    assureMatchVersion();
  }

  public StandardTokenizer create(Reader input) {
    return new StandardTokenizer(luceneMatchVersion, input);
  }
}

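A sketch of how such a version-aware factory is driven. The "luceneMatchVersion" key matches the test helper later in this commit; the concrete version string and wrapper are assumptions:

    import java.io.StringReader;
    import java.util.HashMap;
    import java.util.Map;
    import org.apache.lucene.analysis.Tokenizer;

    class VersionedFactorySketch {
      static Tokenizer tokenize() {
        Map<String,String> args = new HashMap<String,String>();
        args.put("luceneMatchVersion", "LUCENE_31"); // assumed version string
        StandardTokenizerFactory factory = new StandardTokenizerFactory();
        factory.init(args); // init() now fails fast via assureMatchVersion() if the param is missing
        return factory.create(new StringReader("some text"));
      }
    }
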
@@ -0,0 +1,68 @@
package org.apache.solr.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.List;

import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;

/**
 * Factory for {@link StemmerOverrideFilter}
 */
public class StemmerOverrideFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
  private CharArrayMap<String> dictionary = null;
  private boolean ignoreCase;

  public void inform(ResourceLoader loader) {
    String dictionaryFiles = args.get("dictionary");
    ignoreCase = getBoolean("ignoreCase", false);
    if (dictionaryFiles != null) {
      assureMatchVersion();
      List<String> files = StrUtils.splitFileNames(dictionaryFiles);
      try {
        if (files.size() > 0) {
          dictionary = new CharArrayMap<String>(luceneMatchVersion,
              files.size() * 10, ignoreCase);
          for (String file : files) {
            List<String> list = loader.getLines(file.trim());
            for (String line : list) {
              String[] mapping = line.split("\t", 2);
              dictionary.put(mapping[0], mapping[1]);
            }
          }
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  }

  public boolean isIgnoreCase() {
    return ignoreCase;
  }

  public TokenStream create(TokenStream input) {
    return dictionary == null ? input : new StemmerOverrideFilter(luceneMatchVersion, input, dictionary);
  }
}

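The dictionary files read above are plain text with one tab-separated "term<TAB>replacement" pair per line (that is what line.split("\t", 2) implies). A hedged programmatic sketch of the same effect; the version constant, capacity, and sample mapping are assumptions:

    import org.apache.lucene.analysis.CharArrayMap;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
    import org.apache.lucene.util.Version;

    class StemmerOverrideSketch {
      static TokenStream override(TokenStream input) {
        // Equivalent to a dictionary file containing the line: running<TAB>run
        CharArrayMap<String> dict = new CharArrayMap<String>(Version.LUCENE_31, 16, true);
        dict.put("running", "run"); // forces "run" regardless of what a stemmer would emit
        return new StemmerOverrideFilter(Version.LUCENE_31, input, dict);
      }
    }
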
@@ -18,18 +18,14 @@
package org.apache.solr.analysis;

import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.CharArraySet;

import java.util.HashSet;
import java.util.List;
import java.io.File;
import java.util.Map;
import java.util.Set;
import java.io.File;
import java.io.IOException;

/**

@@ -37,6 +33,12 @@ import java.io.IOException;
 */
public class StopFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {

  @Override
  public void init(Map<String,String> args) {
    super.init(args);
    assureMatchVersion();
  }

  public void inform(ResourceLoader loader) {
    String stopWordFiles = args.get("words");
    ignoreCase = getBoolean("ignoreCase",false);

@@ -44,20 +46,12 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc

    if (stopWordFiles != null) {
      try {
        List<String> files = StrUtils.splitFileNames(stopWordFiles);
        if (stopWords == null && files.size() > 0){
          //default stopwords list has 35 or so words, but maybe don't make it that big to start
          stopWords = new CharArraySet(files.size() * 10, ignoreCase);
        }
        for (String file : files) {
          List<String> wlist = loader.getLines(file.trim());
          stopWords.addAll(StopFilter.makeStopSet(wlist, ignoreCase));
        }
        stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    } else {
      stopWords = new CharArraySet(StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
      stopWords = new CharArraySet(luceneMatchVersion, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
    }
  }

@@ -78,7 +72,6 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc
  }

  public StopFilter create(TokenStream input) {
    assureMatchVersion();
    StopFilter stopFilter = new StopFilter(luceneMatchVersion,input,stopWords,ignoreCase);
    stopFilter.setEnablePositionIncrements(enablePositionIncrements);
    return stopFilter;

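This factory, SnowballPorterFilterFactory above, and WordDelimiterFilterFactory below now delegate word-file loading to a shared getWordSet helper on the base factory. Its rough shape, inferred from the inlined code it replaces — a sketch, not the actual source:

    // Inferred shape only; argument order follows the call sites in this commit.
    protected CharArraySet getWordSet(ResourceLoader loader, String wordFiles, boolean ignoreCase)
        throws IOException {
      List<String> files = StrUtils.splitFileNames(wordFiles);
      CharArraySet words = new CharArraySet(files.size() * 10, ignoreCase);
      for (String file : files) {
        List<String> wlist = loader.getLines(file.trim());
        words.addAll(StopFilter.makeStopSet(wlist, ignoreCase));
      }
      return words;
    }
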
@@ -24,7 +24,6 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;

import java.io.IOException;

@@ -50,7 +49,7 @@ public class SynonymFilter extends TokenFilter {
  public SynonymFilter(TokenStream in, SynonymMap map) {
    super(in);
    this.map = map;
    // just ensuring these exist attributes exist...
    // just ensuring these attributes exist...
    addAttribute(TermAttribute.class);
    addAttribute(PositionIncrementAttribute.class);
    addAttribute(OffsetAttribute.class);

@@ -88,7 +87,7 @@ public class SynonymFilter extends TokenFilter {
    // common case fast-path of first token not matching anything
    AttributeSource firstTok = nextTok();
    if (firstTok == null) return false;
    TermAttribute termAtt = (TermAttribute) firstTok.addAttribute(TermAttribute.class);
    TermAttribute termAtt = firstTok.addAttribute(TermAttribute.class);
    SynonymMap result = map.submap!=null ? map.submap.get(termAtt.termBuffer(), 0, termAtt.termLength()) : null;
    if (result == null) {
      copy(this, firstTok);

@@ -121,7 +120,7 @@ public class SynonymFilter extends TokenFilter {
    boolean includeOrig = result.includeOrig();

    AttributeSource origTok = includeOrig ? firstTok : null;
    PositionIncrementAttribute firstPosIncAtt = (PositionIncrementAttribute) firstTok.addAttribute(PositionIncrementAttribute.class);
    PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
    int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
    int repPos=0; // curr position in replacement token stream
    int pos=0; // current position in merged token stream

@@ -129,12 +128,11 @@ public class SynonymFilter extends TokenFilter {
      for (int i=0; i<result.synonyms.length; i++) {
        Token repTok = result.synonyms[i];
        AttributeSource newTok = firstTok.cloneAttributes();
        TermAttribute newTermAtt = (TermAttribute) newTok.addAttribute(TermAttribute.class);
        OffsetAttribute newOffsetAtt = (OffsetAttribute) newTok.addAttribute(OffsetAttribute.class);
        TypeAttribute newTypeAtt = (TypeAttribute) newTok.addAttribute(TypeAttribute.class);
        PositionIncrementAttribute newPosIncAtt = (PositionIncrementAttribute) newTok.addAttribute(PositionIncrementAttribute.class);
        TermAttribute newTermAtt = newTok.addAttribute(TermAttribute.class);
        OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);

        OffsetAttribute lastOffsetAtt = (OffsetAttribute) lastTok.addAttribute(OffsetAttribute.class);
        OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);

        newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
        newTermAtt.setTermBuffer(repTok.termBuffer(), 0, repTok.termLength());

@@ -143,13 +141,13 @@ public class SynonymFilter extends TokenFilter {

      // if necessary, insert original tokens and adjust position increment
      while (origTok != null && origPos <= repPos) {
        PositionIncrementAttribute origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
        PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
        origPosInc.setPositionIncrement(origPos-pos);
        generated.add(origTok);
        pos += origPosInc.getPositionIncrement();
        origTok = matched.isEmpty() ? null : matched.removeFirst();
        if (origTok != null) {
          origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
          origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
          origPos += origPosInc.getPositionIncrement();
        }
      }

@@ -161,13 +159,13 @@ public class SynonymFilter extends TokenFilter {

    // finish up any leftover original tokens
    while (origTok!=null) {
      PositionIncrementAttribute origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
      PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
      origPosInc.setPositionIncrement(origPos-pos);
      generated.add(origTok);
      pos += origPosInc.getPositionIncrement();
      origTok = matched.isEmpty() ? null : matched.removeFirst();
      if (origTok != null) {
        origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
        origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
        origPos += origPosInc.getPositionIncrement();
      }
    }

@@ -217,7 +215,7 @@ public class SynonymFilter extends TokenFilter {
      if (tok == this)
        tok = cloneAttributes();
      // check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level?
      TermAttribute termAtt = (TermAttribute) tok.getAttribute(TermAttribute.class);
      TermAttribute termAtt = tok.getAttribute(TermAttribute.class);
      SynonymMap subMap = map.submap.get(termAtt.termBuffer(), 0, termAtt.termLength());

      if (subMap != null) {

@@ -243,12 +241,8 @@ public class SynonymFilter extends TokenFilter {
  }

  private void copy(AttributeSource target, AttributeSource source) {
    if (target == source)
      return;
    for (Iterator<AttributeImpl> sourceIt = source.getAttributeImplsIterator(), targetIt=target.getAttributeImplsIterator();
         sourceIt.hasNext();) {
      sourceIt.next().copyTo(targetIt.next());
    }
    if (target != source)
      source.copyTo(target);
  }

  @Override

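The cast removals above repeat throughout this commit: addAttribute and getAttribute are generic in Lucene 3.x (<A extends Attribute> A addAttribute(Class<A>)), so the return type is inferred from the class literal. In miniature (illustrative fragment only):

    // Lucene 2.9-era call, cast required:
    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
    // Lucene 3.x generic signature, no cast:
    TermAttribute termAtt2 = stream.addAttribute(TermAttribute.class);
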
@@ -17,7 +17,6 @@

package org.apache.solr.analysis;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.solr.common.ResourceLoader;

@@ -136,7 +135,7 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso
    TokenStream ts = loadTokenizer(tokFactory, reader);
    List<String> tokList = new ArrayList<String>();
    try {
      TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
      TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
      while (ts.incrementToken()){
        String text = new String(termAtt.termBuffer(), 0, termAtt.termLength());
        if( text.length() > 0 )

@@ -17,8 +17,9 @@

package org.apache.solr.analysis;

import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.analysis.Token;
import org.apache.solr.util.CharArrayMap;
import org.apache.lucene.util.Version;

import java.util.*;


@@ -52,7 +53,9 @@ public class SynonymMap {
    SynonymMap currMap = this;
    for (String str : singleMatch) {
      if (currMap.submap==null) {
        currMap.submap = new CharArrayMap<SynonymMap>(1, ignoreCase());
        // for now hardcode at 2.9, as its what the old code did.
        // would be nice to fix, but shouldn't store a version in each submap!!!
        currMap.submap = new CharArrayMap<SynonymMap>(Version.LUCENE_29, 1, ignoreCase());
      }

      SynonymMap map = currMap.submap.get(str);

@@ -68,7 +71,7 @@ public class SynonymMap {
    if (currMap.synonyms != null && !mergeExisting) {
      throw new RuntimeException("SynonymFilter: there is already a mapping for " + singleMatch);
    }
    List superset = currMap.synonyms==null ? replacement :
    List<Token> superset = currMap.synonyms==null ? replacement :
        mergeTokens(Arrays.asList(currMap.synonyms), replacement);
    currMap.synonyms = (Token[])superset.toArray(new Token[superset.size()]);
    if (includeOrig) currMap.flags |= INCLUDE_ORIG;

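A small usage sketch of Lucene's CharArrayMap, which replaces org.apache.solr.util.CharArrayMap here; the extra leading constructor argument is the match version, and the behavior notes in the comments are assumptions based on the deleted Solr class later in this commit:

    import org.apache.lucene.analysis.CharArrayMap;
    import org.apache.lucene.util.Version;

    CharArrayMap<String> m = new CharArrayMap<String>(Version.LUCENE_29, 4, true);
    m.put("Foo", "bar");                           // with ignoreCase=true the key is stored lowercased
    String hit = m.get("foo".toCharArray(), 0, 3); // char[] lookup without building a String
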
@@ -18,15 +18,11 @@


package org.apache.solr.analysis;
import org.apache.lucene.analysis.th.*;
import java.io.IOException;
import java.util.Locale;
import java.lang.Character.UnicodeBlock;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.th.ThaiWordFilter;

import org.apache.lucene.analysis.TokenStream;
import java.text.BreakIterator;
import java.util.Map;

/** Factory for {@link ThaiWordFilter} */
public class ThaiWordFilterFactory extends BaseTokenFilterFactory {
  public ThaiWordFilter create(TokenStream input) {
    return new ThaiWordFilter(input);

@@ -18,13 +18,11 @@


package org.apache.solr.analysis;
import org.apache.lucene.analysis.payloads.*;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;

import org.apache.lucene.analysis.payloads.TokenOffsetPayloadTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Payload;
import java.io.IOException;
import java.util.Map;

/** Factory for {@link TokenOffsetPayloadTokenFilter} */
public class TokenOffsetPayloadTokenFilterFactory extends BaseTokenFilterFactory {
  public TokenOffsetPayloadTokenFilter create(TokenStream input) {
    return new TokenOffsetPayloadTokenFilter(input);

@@ -23,7 +23,6 @@ import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.Tokenizer;

import java.io.Reader;
import java.io.IOException;

/**
 * @version $Id$

@@ -19,7 +19,6 @@ package org.apache.solr.analysis;

import java.io.*;
import java.util.Map;
import org.apache.solr.core.SolrConfig;
import org.apache.lucene.analysis.*;

@@ -16,7 +16,6 @@
 */
package org.apache.solr.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.NumericTokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.solr.common.SolrException;

@@ -17,7 +17,6 @@

package org.apache.solr.analysis;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

@@ -41,8 +40,8 @@ public final class TrimFilter extends TokenFilter {
    super(in);
    this.updateOffsets = updateOffsets;

    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
    this.termAtt = addAttribute(TermAttribute.class);
    this.offsetAtt = addAttribute(OffsetAttribute.class);
  }

  @Override

@@ -20,7 +20,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;

/** Factory for TurkishLowerCaseFilter */
/** Factory for {@link TurkishLowerCaseFilter} */
public class TurkishLowerCaseFilterFactory extends BaseTokenFilterFactory {
  public TokenStream create(TokenStream input) {
    return new TurkishLowerCaseFilter(input);

@@ -18,13 +18,11 @@


package org.apache.solr.analysis;
import org.apache.lucene.analysis.payloads.*;
import org.apache.lucene.analysis.TokenFilter;

import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.index.Payload;
import java.io.IOException;
import java.util.Map;

/** Factory for {@link TypeAsPayloadTokenFilter} */
public class TypeAsPayloadTokenFilterFactory extends BaseTokenFilterFactory {
  public TypeAsPayloadTokenFilter create(TokenStream input) {
    return new TypeAsPayloadTokenFilter(input);

@@ -17,17 +17,22 @@

package org.apache.solr.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

import java.io.Reader;
import java.util.Map;

/**
 * @version $Id$
 */
public class WhitespaceTokenizerFactory extends BaseTokenizerFactory {
  public WhitespaceTokenizer create(Reader input) {
  @Override
  public void init(Map<String,String> args) {
    super.init(args);
    assureMatchVersion();
  }

  public WhitespaceTokenizer create(Reader input) {
    return new WhitespaceTokenizer(luceneMatchVersion,input);
  }
}

@@ -120,10 +120,10 @@ final class WordDelimiterFilter extends TokenFilter {
   */
  final CharArraySet protWords;

  private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
  private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
  private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class);
  private final TermAttribute termAttribute = addAttribute(TermAttribute.class);
  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);

  // used for iterating word delimiter breaks
  private final WordDelimiterIterator iterator;

@@ -21,12 +21,8 @@ import org.apache.lucene.analysis.CharArraySet;

import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;


import java.util.Map;
import java.io.File;
import java.util.List;
import java.io.IOException;


@@ -40,21 +36,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implement
    String wordFiles = args.get(PROTECTED_TOKENS);
    if (wordFiles != null) {
      try {
        File protectedWordFiles = new File(wordFiles);
        if (protectedWordFiles.exists()) {
          List<String> wlist = loader.getLines(wordFiles);
          //This cast is safe in Lucene
          protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
        } else {
          List<String> files = StrUtils.splitFileNames(wordFiles);
          for (String file : files) {
            List<String> wlist = loader.getLines(file.trim());
            if (protectedWords == null)
              protectedWords = new CharArraySet(wlist, false);
            else
              protectedWords.addAll(wlist);
          }
        }
        protectedWords = getWordSet(loader, wordFiles, false);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }

@@ -1,411 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.util;

import java.util.*;
import java.io.Serializable;

/**
 * A simple class that stores key Strings as char[]'s in a
 * hash table. Note that this is not a general purpose
 * class. For example, it cannot remove items from the
 * map, nor does it resize its hash table to be smaller,
 * etc. It is designed to be quick to retrieve items
 * by char[] keys without the necessity of converting
 * to a String first.
 */

public class CharArrayMap<V> extends AbstractMap<String, V>
  implements Map<String, V>, Cloneable, Serializable
{
  private final static int INIT_SIZE = 2;
  private char[][] keys;
  private Object[] values;
  private int count;
  private final boolean ignoreCase;

  /** Create map with enough capacity to hold startSize
   * terms */
  public CharArrayMap(int initialCapacity, boolean ignoreCase) {
    this.ignoreCase = ignoreCase;
    int size = INIT_SIZE;
    // load factor of .75, inverse is 1.25, or x+x/4
    initialCapacity = initialCapacity + (initialCapacity >>2);
    while(size <= initialCapacity)
      size <<= 1;
    keys = new char[size][];
    values = new Object[size];
  }

  public boolean ignoreCase() {
    return ignoreCase;
  }

  public V get(char[] key) {
    return get(key, 0, key.length);
  }

  public V get(char[] key, int off, int len) {
    return (V)values[getSlot(key, off, len)];
  }

  public V get(CharSequence key) {
    return (V)values[getSlot(key)];
  }

  @Override
  public V get(Object key) {
    return (V)values[getSlot(key)];
  }

  @Override
  public boolean containsKey(Object s) {
    return keys[getSlot(s)] != null;
  }

  @Override
  public boolean containsValue(Object value) {
    if (value == null) {
      // search for key with a null value
      for (int i=0; i<keys.length; i++) {
        if (keys[i] != null && values[i] == null) return true;
      }
      return false;
    }

    for (int i=0; i<values.length; i++) {
      Object val = values[i];
      if (val != null && value.equals(val)) return true;
    }
    return false;
  }


  private int getSlot(Object key) {
    if (key instanceof char[]) {
      char[] keyc = (char[])key;
      return getSlot(keyc, 0, keyc.length);
    }
    return getSlot((CharSequence)key);
  }

  private int getSlot(char[] key, int off, int len) {
    int code = getHashCode(key, len);
    int pos = code & (keys.length-1);
    char[] key2 = keys[pos];
    if (key2 != null && !equals(key, off, len, key2)) {
      final int inc = ((code>>8)+code)|1;
      do {
        code += inc;
        pos = code & (keys.length-1);
        key2 = keys[pos];
      } while (key2 != null && !equals(key, off, len, key2));
    }
    return pos;
  }

  /** Returns true if the String is in the set */
  private int getSlot(CharSequence key) {
    int code = getHashCode(key);
    int pos = code & (keys.length-1);
    char[] key2 = keys[pos];
    if (key2 != null && !equals(key, key2)) {
      final int inc = ((code>>8)+code)|1;
      do {
        code += inc;
        pos = code & (keys.length-1);
        key2 = keys[pos];
      } while (key2 != null && !equals(key, key2));
    }
    return pos;
  }

  public V put(CharSequence key, V val) {
    return put(key.toString(), val); // could be more efficient
  }

  @Override
  public V put(String key, V val) {
    return put(key.toCharArray(), val);
  }

  /** Add this key,val pair to the map.
   * The char[] key is directly used, no copy is made.
   * If ignoreCase is true for this Map, the key array will be directly modified.
   * The user should never modify the key after calling this method.
   */
  public V put(char[] key, Object val) {
    if (ignoreCase)
      for(int i=0;i< key.length;i++)
        key[i] = Character.toLowerCase(key[i]);
    int slot = getSlot(key, 0, key.length);
    if (keys[slot] == null) count++;
    Object prev = values[slot];
    keys[slot] = key;
    values[slot] = val;

    if (count + (count>>2) >= keys.length) {
      rehash();
    }

    return (V)prev;
  }


  private boolean equals(char[] text1, int off, int len, char[] text2) {
    if (len != text2.length)
      return false;
    if (ignoreCase) {
      for(int i=0;i<len;i++) {
        if (Character.toLowerCase(text1[off+i]) != text2[i])
          return false;
      }
    } else {
      for(int i=0;i<len;i++) {
        if (text1[off+i] != text2[i])
          return false;
      }
    }
    return true;
  }

  private boolean equals(CharSequence text1, char[] text2) {
    int len = text1.length();
    if (len != text2.length)
      return false;
    if (ignoreCase) {
      for(int i=0;i<len;i++) {
        if (Character.toLowerCase(text1.charAt(i)) != text2[i])
          return false;
      }
    } else {
      for(int i=0;i<len;i++) {
        if (text1.charAt(i) != text2[i])
          return false;
      }
    }
    return true;
  }

  private void rehash() {
    final int newSize = 2* keys.length;
    char[][] oldEntries = keys;
    Object[] oldValues = values;
    keys = new char[newSize][];
    values = new Object[newSize];

    for(int i=0;i<oldEntries.length;i++) {
      char[] key = oldEntries[i];
      if (key != null) {
        // todo: could be faster... no need to compare keys on collision
        // since they are unique
        int newSlot = getSlot(key,0,key.length);
        keys[newSlot] = key;
        values[newSlot] = oldValues[i];
      }
    }
  }

  private int getHashCode(char[] text, int len) {
    int code = 0;
    if (ignoreCase) {
      for (int i=0; i<len; i++) {
        code = code*31 + Character.toLowerCase(text[i]);
      }
    } else {
      for (int i=0; i<len; i++) {
        code = code*31 + text[i];
      }
    }
    return code;
  }

  private int getHashCode(CharSequence text) {
    int code;
    if (ignoreCase) {
      code = 0;
      int len = text.length();
      for (int i=0; i<len; i++) {
        code = code*31 + Character.toLowerCase(text.charAt(i));
      }
    } else {
      if (false && text instanceof String) {
        code = text.hashCode();
      } else {
        code = 0;
        int len = text.length();
        for (int i=0; i<len; i++) {
          code = code*31 + text.charAt(i);
        }
      }
    }
    return code;
  }

  @Override
  public int size() {
    return count;
  }

  @Override
  public boolean isEmpty() {
    return count==0;
  }

  @Override
  public void clear() {
    count = 0;
    Arrays.fill(keys,null);
    Arrays.fill(values,null);
  }

  @Override
  public Set<Entry<String, V>> entrySet() {
    return new EntrySet();
  }

  /** Returns an EntryIterator over this Map. */
  public EntryIterator iterator() {
    return new EntryIterator();
  }

  /** public iterator class so efficient methods are exposed to users */
  public class EntryIterator implements Iterator<Map.Entry<String,V>> {
    int pos=-1;
    int lastPos;

    EntryIterator() {
      goNext();
    }

    private void goNext() {
      lastPos = pos;
      pos++;
      while (pos < keys.length && keys[pos] == null) pos++;
    }

    public boolean hasNext() {
      return pos < keys.length;
    }

    /** gets the next key... do not modify the returned char[] */
    public char[] nextKey() {
      goNext();
      return keys[lastPos];
    }

    /** gets the next key as a newly created String object */
    public String nextKeyString() {
      return new String(nextKey());
    }

    /** returns the value associated with the last key returned */
    public V currentValue() {
      return (V)values[lastPos];
    }

    /** sets the value associated with the last key returned */
    public V setValue(V value) {
      V old = (V)values[lastPos];
      values[lastPos] = value;
      return old;
    }

    /** Returns an Entry<String,V> object created on the fly...
     * use nextCharArray() + currentValie() for better efficiency. */
    public Map.Entry<String,V> next() {
      goNext();
      return new MapEntry(lastPos);
    }

    public void remove() {
      throw new UnsupportedOperationException();
    }
  }


  private class MapEntry implements Map.Entry<String,V> {
    final int pos;

    MapEntry(int pos) {
      this.pos = pos;
    }

    public char[] getCharArr() {
      return keys[pos];
    }

    public String getKey() {
      return new String(getCharArr());
    }

    public V getValue() {
      return (V)values[pos];
    }

    public V setValue(V value) {
      V old = (V)values[pos];
      values[pos] = value;
      return old;
    }

    public String toString() {
      return getKey() + '=' + getValue();
    }
  }


  private class EntrySet extends AbstractSet<Map.Entry<String, V>> {
    public EntryIterator iterator() {
      return new EntryIterator();
    }
    public boolean contains(Object o) {
      if (!(o instanceof Map.Entry))
        return false;
      Map.Entry e = (Map.Entry)o;
      Object key = e.getKey();
      if (key==null) return false; // we don't support null keys
      Object val = e.getValue();
      Object v = get(key);
      return v==null ? val==null : v.equals(val);
    }
    public boolean remove(Object o) {
      throw new UnsupportedOperationException();
    }
    public int size() {
      return count;
    }
    public void clear() {
      CharArrayMap.this.clear();
    }
  }

  @Override
  public Object clone() {
    CharArrayMap<V> map = null;
    try {
      map = (CharArrayMap<V>)super.clone();
      map.keys = keys.clone();
      map.values = values.clone();
    } catch (CloneNotSupportedException e) {
      // impossible
    }
    return map;
  }
}

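The entire helper above is deleted because Lucene now ships an equivalent org.apache.lucene.analysis.CharArrayMap (already imported by SynonymMap and StemmerOverrideFilterFactory earlier in this commit). A migration sketch, assuming only the import and the added match-version constructor argument change:

    // old (the deleted class): org.apache.solr.util.CharArrayMap
    CharArrayMap<String> oldMap = new CharArrayMap<String>(16, false);
    // new: org.apache.lucene.analysis.CharArrayMap, with a leading match version
    org.apache.lucene.analysis.CharArrayMap<String> newMap =
        new org.apache.lucene.analysis.CharArrayMap<String>(Version.LUCENE_29, 16, false);
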
@@ -21,13 +21,18 @@ import java.util.Collections;
import java.util.Map;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;
import org.apache.solr.core.Config;

/**
 * General token testing helper functions
 */
public abstract class BaseTokenTestCase extends BaseTokenStreamTestCase
{
  /** a map containing the default test version param for easy testing */
  protected static final Map<String,String> DEFAULT_VERSION_PARAM =
    Collections.singletonMap("luceneMatchVersion", System.getProperty("tests.luceneMatchVersion", "LUCENE_CURRENT"));

  /** The default test version for easy testing */
  public static final Version DEFAULT_VERSION = Config.parseLuceneVersionString(DEFAULT_VERSION_PARAM.get("luceneMatchVersion"));
}

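A representative fragment of how the factory tests below consume these two helpers (it mirrors the CommonGramsFilterFactoryTest changes that follow):

    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM); // seeds luceneMatchVersion
    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("testing the factory"));
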
@@ -39,12 +39,12 @@ public class CommonGramsFilterFactoryTest extends BaseTokenTestCase {
    ResourceLoader loader = new SolrResourceLoader(null, null);
    assertTrue("loader is null and it shouldn't be", loader != null);
    CommonGramsFilterFactory factory = new CommonGramsFilterFactory();
    Map<String, String> args = new HashMap<String, String>();
    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
    args.put("words", "stop-1.txt");
    args.put("ignoreCase", "true");
    factory.init(args);
    factory.inform(loader);
    Set words = factory.getCommonWords();
    Set<?> words = factory.getCommonWords();
    assertTrue("words is null and it shouldn't be", words != null);
    assertTrue("words Size: " + words.size() + " is not: " + 2,
        words.size() == 2);

@@ -71,13 +71,13 @@ public class CommonGramsFilterFactoryTest extends BaseTokenTestCase {
    ResourceLoader loader = new SolrResourceLoader(null, null);
    assertTrue("loader is null and it shouldn't be", loader != null);
    CommonGramsFilterFactory factory = new CommonGramsFilterFactory();
    Map<String, String> args = new HashMap<String, String>();
    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
    factory.init(args);
    factory.inform(loader);
    Set words = factory.getCommonWords();
    Set<?> words = factory.getCommonWords();
    assertTrue("words is null and it shouldn't be", words != null);
    assertTrue(words.contains("the"));
    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("testing the factory"));
    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("testing the factory"));
    TokenStream stream = factory.create(tokenizer);
    assertTokenStreamContents(stream,
        new String[] { "testing", "testing_the", "the", "the_factory", "factory" });

@@ -35,10 +35,10 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {

  public void testReset() throws Exception {
    final String input = "How the s a brown s cow d like A B thing?";
    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
    WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);

    TermAttribute term = (TermAttribute) cgf.addAttribute(TermAttribute.class);
    TermAttribute term = cgf.addAttribute(TermAttribute.class);
    assertTrue(cgf.incrementToken());
    assertEquals("How", term.term());
    assertTrue(cgf.incrementToken());

@@ -56,11 +56,11 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {

  public void testQueryReset() throws Exception {
    final String input = "How the s a brown s cow d like A B thing?";
    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
    WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
    CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);

    TermAttribute term = (TermAttribute) wt.addAttribute(TermAttribute.class);
    TermAttribute term = wt.addAttribute(TermAttribute.class);
    assertTrue(nsf.incrementToken());
    assertEquals("How_the", term.term());
    assertTrue(nsf.incrementToken());

@@ -88,7 +87,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
      @Override
      public TokenStream tokenStream(String field, Reader in) {
        return new CommonGramsQueryFilter(new CommonGramsFilter(
            new WhitespaceTokenizer(in), commonWords));
            new WhitespaceTokenizer(DEFAULT_VERSION, in), commonWords));
      }
    };

@@ -157,7 +157,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
      @Override
      public TokenStream tokenStream(String field, Reader in) {
        return new CommonGramsFilter(
            new WhitespaceTokenizer(in), commonWords);
            new WhitespaceTokenizer(DEFAULT_VERSION, in), commonWords);
      }
    };

@@ -243,7 +243,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
   */
  public void testCaseSensitive() throws Exception {
    final String input = "How The s a brown s cow d like A B thing?";
    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
    WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
    Set common = CommonGramsFilter.makeCommonSet(commonWords);
    TokenFilter cgf = new CommonGramsFilter(wt, common, false);
    assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s",

@@ -256,7 +256,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
   */
  public void testLastWordisStopWord() throws Exception {
    final String input = "dog the";
    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
    WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
    TokenFilter nsf = new CommonGramsQueryFilter(cgf);
    assertTokenStreamContents(nsf, new String[] { "dog_the" });

@@ -267,7 +267,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
   */
  public void testFirstWordisStopWord() throws Exception {
    final String input = "the dog";
    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
    WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
    TokenFilter nsf = new CommonGramsQueryFilter(cgf);
    assertTokenStreamContents(nsf, new String[] { "the_dog" });

@@ -278,7 +278,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
   */
  public void testOneWordQueryStopWord() throws Exception {
    final String input = "the";
    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
    WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
    TokenFilter nsf = new CommonGramsQueryFilter(cgf);
    assertTokenStreamContents(nsf, new String[] { "the" });

@@ -289,7 +289,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
   */
  public void testOneWordQuery() throws Exception {
    final String input = "monster";
    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
    WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
    TokenFilter nsf = new CommonGramsQueryFilter(cgf);
    assertTokenStreamContents(nsf, new String[] { "monster" });

@@ -300,7 +300,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
   */
  public void TestFirstAndLastStopWord() throws Exception {
    final String input = "the of";
    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
    WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
    TokenFilter nsf = new CommonGramsQueryFilter(cgf);
    assertTokenStreamContents(nsf, new String[] { "the_of" });

@@ -38,12 +38,12 @@ public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase {
    ResourceLoader loader = new SolrResourceLoader(null, null);
    assertTrue("loader is null and it shouldn't be", loader != null);
    CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory();
    Map<String, String> args = new HashMap<String, String>();
    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
    args.put("words", "stop-1.txt");
    args.put("ignoreCase", "true");
    factory.init(args);
    factory.inform(loader);
    Set words = factory.getCommonWords();
    Set<?> words = factory.getCommonWords();
    assertTrue("words is null and it shouldn't be", words != null);
    assertTrue("words Size: " + words.size() + " is not: " + 2,
        words.size() == 2);

@@ -70,13 +70,13 @@ public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase {
    ResourceLoader loader = new SolrResourceLoader(null, null);
    assertTrue("loader is null and it shouldn't be", loader != null);
    CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory();
    Map<String, String> args = new HashMap<String, String>();
    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
    factory.init(args);
    factory.inform(loader);
    Set words = factory.getCommonWords();
    Set<?> words = factory.getCommonWords();
    assertTrue("words is null and it shouldn't be", words != null);
    assertTrue(words.contains("the"));
    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("testing the factory"));
    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("testing the factory"));
    TokenStream stream = factory.create(tokenizer);
    assertTokenStreamContents(stream,
        new String[] { "testing_the", "the_factory" });

@@ -29,7 +29,7 @@ public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase {
  public void testDefaults() throws Exception {
    DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
    factory.init(new HashMap<String, String>());
    TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
    TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));

    TokenStream filteredStream = factory.create(inputStream);
    assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());

@@ -43,7 +43,7 @@ public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase {
    parameters.put("maxCodeLength", "8");
    factory.init(parameters);

    TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
    TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));

    TokenStream filteredStream = factory.create(inputStream);
    assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());

@@ -56,10 +56,10 @@ public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase {
  public void testReset() throws Exception {
    DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
    factory.init(new HashMap<String, String>());
    TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
    TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));

    TokenStream filteredStream = factory.create(inputStream);
    TermAttribute termAtt = (TermAttribute) filteredStream.addAttribute(TermAttribute.class);
    TermAttribute termAtt = filteredStream.addAttribute(TermAttribute.class);
    assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());

    assertTrue(filteredStream.incrementToken());

@@ -24,42 +24,42 @@ import org.apache.lucene.analysis.WhitespaceTokenizer;
public class DoubleMetaphoneFilterTest extends BaseTokenTestCase {

  public void testSize4FalseInject() throws Exception {
    TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
    TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
    assertTokenStreamContents(filter, new String[] { "ANTR" });
  }

  public void testSize4TrueInject() throws Exception {
    TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
    TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
    assertTokenStreamContents(filter, new String[] { "international", "ANTR" });
  }

  public void testAlternateInjectFalse() throws Exception {
    TokenStream stream = new WhitespaceTokenizer(new StringReader("Kuczewski"));
    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Kuczewski"));
    TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
    assertTokenStreamContents(filter, new String[] { "KSSK", "KXFS" });
  }

  public void testSize8FalseInject() throws Exception {
    TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
    TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
    assertTokenStreamContents(filter, new String[] { "ANTRNXNL" });
  }

  public void testNonConvertableStringsWithInject() throws Exception {
    TokenStream stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%&"));
    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&"));
    TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
    assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
  }

  public void testNonConvertableStringsWithoutInject() throws Exception {
    TokenStream stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%&"));
    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&"));
    TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
    assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });

    // should have something after the stream
    stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%& hello"));
    stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%& hello"));
    filter = new DoubleMetaphoneFilter(stream, 8, false);
    assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" });
  }

@@ -46,11 +46,11 @@ public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {
    }

    EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
    Map<String, String> args = new HashMap<String, String>();
    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);

    factory.init(args);
    factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
    Tokenizer tokenizer = new WhitespaceTokenizer(
    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION,
        new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
    TokenStream stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, gold);

@@ -71,13 +71,13 @@ public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {
    }

    EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
    Map<String, String> args = new HashMap<String, String>();
    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
    args.put(EnglishPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt");
    factory.init(args);
    List<String> lines = new ArrayList<String>();
    Collections.addAll(lines, "banks", "fledgling");
    factory.inform(new LinesMockSolrResourceLoader(lines));
    Tokenizer tokenizer = new WhitespaceTokenizer(
    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION,
        new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
    TokenStream stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, gold);

@@ -33,7 +33,7 @@ public class LengthFilterTest extends BaseTokenTestCase {
     args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10));
     factory.init(args);
     String test = "foo foobar super-duper-trooper";
-    TokenStream stream = factory.create(new WhitespaceTokenizer(new StringReader(test)));
+    TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(test)));
     assertTokenStreamContents(stream, new String[] { "foobar" });
   }
 }
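For reference, what LengthFilterTest asserts: tokens whose length falls outside the configured [min, max] band are dropped. A sketch with Lucene's plain LengthFilter, assuming min=4 (set just above this hunk) alongside the max=10 visible in it:

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.LengthFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.Version;

    public class LengthFilterSketch {
      public static void main(String[] args) throws IOException {
        TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_31,
            new StringReader("foo foobar super-duper-trooper"));
        // "foo" (3 chars) and "super-duper-trooper" (19 chars) fall outside [4,10].
        ts = new LengthFilter(ts, 4, 10);
        TermAttribute term = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term()); // prints only "foobar"
        }
      }
    }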
@@ -48,12 +48,12 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
     }
 
     SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
-    Map<String, String> args = new HashMap<String, String>();
+    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     args.put("language", "English");
 
     factory.init(args);
     factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
-    Tokenizer tokenizer = new WhitespaceTokenizer(
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION,
         new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, gold);
@@ -78,13 +78,13 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
     }
 
     EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
-    Map<String, String> args = new HashMap<String, String>();
+    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     args.put(SnowballPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt");
     factory.init(args);
     List<String> lines = new ArrayList<String>();
     Collections.addAll(lines, "banks", "fledgling");
     factory.inform(new LinesMockSolrResourceLoader(lines));
-    Tokenizer tokenizer = new WhitespaceTokenizer(
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION,
         new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, gold);
@@ -116,13 +116,13 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
   public void testProtected() throws Exception {
     SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
     ResourceLoader loader = new SolrResourceLoader(null, null);
-    Map<String,String> args = new HashMap<String,String>();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
     args.put("protected", "protwords.txt");
     args.put("language", "English");
     factory.init(args);
     factory.inform(loader);
     Reader reader = new StringReader("ridding of some stemming");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "ridding", "of", "some", "stem" });
   }
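testProtected exercises the stemming-protection path that SOLR-1857 rebuilt on Lucene's keyword marker: words from protwords.txt are flagged as keywords, and keyword-aware stemmers leave them alone. A minimal sketch of the equivalent wiring, assuming the Lucene 3.1-era KeywordMarkerFilter and SnowballFilter APIs (the factory assembles this internally):

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.CharArraySet;
    import org.apache.lucene.analysis.KeywordMarkerFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.snowball.SnowballFilter;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.Version;

    public class ProtectedStemmingSketch {
      public static void main(String[] args) throws IOException {
        // "ridding" is protected, so only "stemming" gets stemmed.
        CharArraySet protwords = new CharArraySet(Version.LUCENE_31, 1, false);
        protwords.add("ridding");
        TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_31,
            new StringReader("ridding of some stemming"));
        ts = new SnowballFilter(new KeywordMarkerFilter(ts, protwords), "English");
        TermAttribute term = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term()); // ridding, of, some, stem
        }
      }
    }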
@@ -33,7 +33,7 @@ public class TestBrazilianStemFilterFactory extends BaseTokenTestCase {
    */
   public void testStemming() throws Exception {
     Reader reader = new StringReader("Brasília");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     BrazilianStemFilterFactory factory = new BrazilianStemFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "brasil" });
@@ -59,7 +59,7 @@ public class TestBufferedTokenStream extends BaseTokenTestCase {
     final String input = "How now A B brown A cow B like A B thing?";
     final String expected = "How now Q B brown A cow B like Q B thing?";
     TokenStream ts = new AB_Q_Stream
-      (new WhitespaceTokenizer(new StringReader(input)));
+      (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
     assertTokenStreamContents(ts, expected.split("\\s"));
   }
 
@@ -67,15 +67,15 @@ public class TestBufferedTokenStream extends BaseTokenTestCase {
     final String input = "How now A B brown A cow B like A B thing?";
     final String expected = "How now A A B brown A cow B like A A B thing?";
     TokenStream ts = new AB_AAB_Stream
-      (new WhitespaceTokenizer(new StringReader(input)));
+      (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
     assertTokenStreamContents(ts, expected.split("\\s"));
   }
 
   public void testReset() throws Exception {
     final String input = "How now A B brown A cow B like A B thing?";
-    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(input));
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     TokenStream ts = new AB_AAB_Stream(tokenizer);
-    TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
+    TermAttribute term = ts.addAttribute(TermAttribute.class);
     assertTrue(ts.incrementToken());
     assertEquals("How", term.term());
     assertTrue(ts.incrementToken());
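The cast removals in testReset (and in the collation test further down) work because the attribute API is generic: addAttribute(Class<A>) returns the requested type directly, and the single instance it returns is refilled in place by each incrementToken(). A short sketch:

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.Version;

    public class AttributeSketch {
      public static void main(String[] args) throws IOException {
        TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_31,
            new StringReader("How now"));
        TermAttribute term = ts.addAttribute(TermAttribute.class); // no cast needed
        while (ts.incrementToken()) {
          // Same attribute object each time; its contents change per token.
          System.out.println(term.term()); // "How", then "now"
        }
      }
    }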
@@ -33,7 +33,7 @@ public class TestBulgarianStemFilterFactory extends BaseTokenTestCase {
    */
   public void testStemming() throws Exception {
     Reader reader = new StringReader("компютри");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     BulgarianStemFilterFactory factory = new BulgarianStemFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "компютр" });
@@ -34,7 +34,7 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
 
   public void testCapitalization() throws Exception
   {
-    Map<String,String> args = new HashMap<String, String>();
+    Map<String,String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     args.put( CapitalizationFilterFactory.KEEP, "and the it BIG" );
     args.put( CapitalizationFilterFactory.ONLY_FIRST_WORD, "true" );
 
@@ -74,18 +74,18 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
 
     // now each token
     factory.onlyFirstWord = false;
-    tokenizer = new WhitespaceTokenizer(new StringReader("Hello thEre my Name is Ryan"));
+    tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"));
     stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" });
 
     // now only the long words
     factory.minWordLength = 3;
-    tokenizer = new WhitespaceTokenizer(new StringReader("Hello thEre my Name is Ryan" ));
+    tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan" ));
     stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "Hello", "There", "my", "Name", "is", "Ryan" });
 
     // without prefix
-    tokenizer = new WhitespaceTokenizer(new StringReader("McKinley" ));
+    tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" ));
     stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "Mckinley" });
 
@@ -93,14 +93,14 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
     factory = new CapitalizationFilterFactory();
     args.put( "okPrefix", "McK" ); // all words
     factory.init( args );
-    tokenizer = new WhitespaceTokenizer(new StringReader("McKinley" ));
+    tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" ));
     stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "McKinley" });
 
     // now try some stuff with numbers
     factory.forceFirstLetter = false;
     factory.onlyFirstWord = false;
-    tokenizer = new WhitespaceTokenizer(new StringReader("1st 2nd third" ));
+    tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("1st 2nd third" ));
     stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "1st", "2nd", "Third" });
 
@@ -111,7 +111,7 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
   }
 
   public void testKeepIgnoreCase() throws Exception {
-    Map<String,String> args = new HashMap<String, String>();
+    Map<String,String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     args.put( CapitalizationFilterFactory.KEEP, "kitten" );
     args.put( CapitalizationFilterFactory.KEEP_IGNORE_CASE, "true" );
     args.put( CapitalizationFilterFactory.ONLY_FIRST_WORD, "true" );
@@ -141,12 +141,12 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
    * This is very weird when combined with ONLY_FIRST_WORD!!!
    */
   public void testMinWordLength() throws Exception {
-    Map<String,String> args = new HashMap<String,String>();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
     args.put(CapitalizationFilterFactory.ONLY_FIRST_WORD, "true");
     args.put(CapitalizationFilterFactory.MIN_WORD_LENGTH, "5");
     CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
     factory.init(args);
-    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(
         "helo testing"));
     TokenStream ts = factory.create(tokenizer);
     assertTokenStreamContents(ts, new String[] {"helo", "Testing"});
@@ -157,11 +157,11 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
    * in each token (it should do nothing)
    */
   public void testMaxWordCount() throws Exception {
-    Map<String,String> args = new HashMap<String,String>();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
     args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2");
     CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
     factory.init(args);
-    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(
         "one two three four"));
     TokenStream ts = factory.create(tokenizer);
     assertTokenStreamContents(ts, new String[] {"One", "Two", "Three", "Four"});
@@ -171,7 +171,7 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
    * Test CapitalizationFilterFactory's maxWordCount option when exceeded
    */
   public void testMaxWordCount2() throws Exception {
-    Map<String,String> args = new HashMap<String,String>();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
     args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2");
     CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
     factory.init(args);
@@ -187,11 +187,11 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
    * This is weird, it is not really a max, but inclusive (look at 'is')
    */
   public void testMaxTokenLength() throws Exception {
-    Map<String,String> args = new HashMap<String,String>();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
     args.put(CapitalizationFilterFactory.MAX_TOKEN_LENGTH, "2");
     CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
     factory.init(args);
-    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(
         "this is a test"));
     TokenStream ts = factory.create(tokenizer);
     assertTokenStreamContents(ts, new String[] {"this", "is", "A", "test"});
@@ -201,12 +201,12 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
    * Test CapitalizationFilterFactory's forceFirstLetter option
    */
   public void testForceFirstLetter() throws Exception {
-    Map<String,String> args = new HashMap<String,String>();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
     args.put(CapitalizationFilterFactory.KEEP, "kitten");
     args.put(CapitalizationFilterFactory.FORCE_FIRST_LETTER, "true");
     CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
     factory.init(args);
-    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("kitten"));
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("kitten"));
     TokenStream ts = factory.create(tokenizer);
     assertTokenStreamContents(ts, new String[] {"Kitten"});
   }
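All of the CapitalizationFilterFactory hunks follow the same options-in-a-map pattern, and the keys (KEEP, FORCE_FIRST_LETTER, MAX_WORD_COUNT, and so on) are constants on the factory itself. A sketch of driving it end to end, with the version entry assumed as before:

    import java.io.StringReader;
    import java.util.HashMap;
    import java.util.Map;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.util.Version;
    import org.apache.solr.analysis.CapitalizationFilterFactory;

    public class CapitalizationSketch {
      public static void main(String[] a) throws Exception {
        Map<String, String> args = new HashMap<String, String>();
        args.put("luceneMatchVersion", "LUCENE_31"); // assumed version seed
        args.put(CapitalizationFilterFactory.KEEP, "kitten");
        args.put(CapitalizationFilterFactory.FORCE_FIRST_LETTER, "true");

        CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
        factory.init(args);

        Tokenizer tok = new WhitespaceTokenizer(Version.LUCENE_31,
            new StringReader("kitten"));
        TokenStream ts = factory.create(tok);
        // As asserted above: forceFirstLetter capitalizes even a kept word
        // in first position, yielding "Kitten".
      }
    }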
@@ -33,7 +33,7 @@ public class TestChineseFilterFactory extends BaseTokenTestCase {
    */
   public void testFiltering() throws Exception {
     Reader reader = new StringReader("this 1234 Is such a silly filter");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     ChineseFilterFactory factory = new ChineseFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "Is", "silly", "filter" });
@@ -177,9 +177,9 @@ public class TestCollationKeyFilterFactory extends BaseTokenTestCase {
 
   private void assertCollatesToSame(TokenStream stream1, TokenStream stream2)
       throws IOException {
-    TermAttribute term1 = (TermAttribute) stream1
+    TermAttribute term1 = stream1
         .addAttribute(TermAttribute.class);
-    TermAttribute term2 = (TermAttribute) stream2
+    TermAttribute term2 = stream2
         .addAttribute(TermAttribute.class);
     assertTrue(stream1.incrementToken());
     assertTrue(stream2.incrementToken());
@@ -33,7 +33,7 @@ public class TestCzechStemFilterFactory extends BaseTokenTestCase {
    */
   public void testStemming() throws Exception {
     Reader reader = new StringReader("angličtí");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     CzechStemFilterFactory factory = new CzechStemFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "anglick" });
@@ -21,8 +21,6 @@ import java.io.StringReader;
 import java.util.HashMap;
 import java.util.Map;
 
-import junit.framework.TestCase;
-
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
@@ -32,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.core.SolrResourceLoader;
 
-public class TestDelimitedPayloadTokenFilterFactory extends TestCase {
+public class TestDelimitedPayloadTokenFilterFactory extends BaseTokenTestCase {
 
   public void testEncoder() throws Exception {
     Map<String,String> args = new HashMap<String, String>();
@@ -42,10 +40,10 @@
     ResourceLoader loader = new SolrResourceLoader(null, null);
     factory.inform(loader);
 
-    TokenStream input = new WhitespaceTokenizer(new StringReader("the|0.1 quick|0.1 red|0.1"));
+    TokenStream input = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("the|0.1 quick|0.1 red|0.1"));
     DelimitedPayloadTokenFilter tf = factory.create(input);
     while (tf.incrementToken()){
-      PayloadAttribute payAttr = (PayloadAttribute) tf.getAttribute(PayloadAttribute.class);
+      PayloadAttribute payAttr = tf.getAttribute(PayloadAttribute.class);
       assertTrue("payAttr is null and it shouldn't be", payAttr != null);
       byte[] payData = payAttr.getPayload().getData();
       assertTrue("payData is null and it shouldn't be", payData != null);
@@ -64,10 +62,10 @@
     ResourceLoader loader = new SolrResourceLoader(null, null);
     factory.inform(loader);
 
-    TokenStream input = new WhitespaceTokenizer(new StringReader("the*0.1 quick*0.1 red*0.1"));
+    TokenStream input = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("the*0.1 quick*0.1 red*0.1"));
     DelimitedPayloadTokenFilter tf = factory.create(input);
     while (tf.incrementToken()){
-      PayloadAttribute payAttr = (PayloadAttribute) tf.getAttribute(PayloadAttribute.class);
+      PayloadAttribute payAttr = tf.getAttribute(PayloadAttribute.class);
       assertTrue("payAttr is null and it shouldn't be", payAttr != null);
       byte[] payData = payAttr.getPayload().getData();
       assertTrue("payData is null and it shouldn't be", payData != null);
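What the payload tests exercise: each incoming token carries a trailing delimiter-plus-float, which the filter strips off and stores as an encoded payload on the token. A sketch with the underlying Lucene filter, assuming its (input, delimiter, encoder) constructor with a FloatEncoder (the factory arranges the same thing from its init args):

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
    import org.apache.lucene.analysis.payloads.FloatEncoder;
    import org.apache.lucene.analysis.payloads.PayloadHelper;
    import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
    import org.apache.lucene.util.Version;

    public class PayloadSketch {
      public static void main(String[] args) throws IOException {
        TokenStream input = new WhitespaceTokenizer(Version.LUCENE_31,
            new StringReader("the|0.1 quick|0.1 red|0.1"));
        DelimitedPayloadTokenFilter tf =
            new DelimitedPayloadTokenFilter(input, '|', new FloatEncoder());
        PayloadAttribute payAttr = tf.addAttribute(PayloadAttribute.class);
        while (tf.incrementToken()) {
          byte[] data = payAttr.getPayload().getData();
          System.out.println(PayloadHelper.decodeFloat(data)); // 0.1 each time
        }
      }
    }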
@@ -37,10 +37,10 @@ public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenTestCase {
    */
   public void testDecompounding() throws Exception {
     Reader reader = new StringReader("I like to play softball");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     DictionaryCompoundWordTokenFilterFactory factory = new DictionaryCompoundWordTokenFilterFactory();
     ResourceLoader loader = new SolrResourceLoader(null, null);
-    Map<String,String> args = new HashMap<String,String>();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
     args.put("dictionary", "compoundDictionary.txt");
     factory.init(args);
     factory.inform(loader);
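The decompounding test splits compounds against a dictionary while keeping the original token. A sketch with the underlying Lucene filter, assuming the 3.0-era convenience constructor that takes a String[] dictionary (the factory loads compoundDictionary.txt into the same structure):

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.Version;

    public class DecompoundSketch {
      public static void main(String[] args) throws IOException {
        String[] dict = { "soft", "ball" };
        TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_31,
            new StringReader("I like to play softball"));
        ts = new DictionaryCompoundWordTokenFilter(ts, dict);
        TermAttribute term = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          // Every original token comes through; "soft" and "ball" are
          // emitted stacked at the same position as "softball".
          System.out.println(term.term());
        }
      }
    }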
@@ -33,7 +33,7 @@ public class TestDutchStemFilterFactory extends BaseTokenTestCase {
    */
   public void testStemming() throws Exception {
     Reader reader = new StringReader("lichamelijkheden");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     DutchStemFilterFactory factory = new DutchStemFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "licham" });
@@ -37,7 +37,7 @@ public class TestElisionFilterFactory extends BaseTokenTestCase {
    */
   public void testElision() throws Exception {
     Reader reader = new StringReader("l'avion");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     ElisionFilterFactory factory = new ElisionFilterFactory();
     factory.init(DEFAULT_VERSION_PARAM);
     ResourceLoader loader = new SolrResourceLoader(null, null);
@@ -33,7 +33,7 @@ public class TestFrenchStemFilterFactory extends BaseTokenTestCase {
    */
   public void testStemming() throws Exception {
     Reader reader = new StringReader("habitable");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     FrenchStemFilterFactory factory = new FrenchStemFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "habit" });
@@ -33,7 +33,7 @@ public class TestGermanStemFilterFactory extends BaseTokenTestCase {
    */
   public void testStemming() throws Exception {
     Reader reader = new StringReader("Tischen");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     GermanStemFilterFactory factory = new GermanStemFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "tisch" });
@@ -33,7 +33,7 @@ public class TestGreekLowerCaseFilterFactory extends BaseTokenTestCase {
    */
   public void testStemming() throws Exception {
     Reader reader = new StringReader("Μάϊος ΜΆΪΟΣ");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     GreekLowerCaseFilterFactory factory = new GreekLowerCaseFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "μαιοσ", "μαιοσ" });
@@ -29,7 +29,7 @@ public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
   public void testHyphenatedWords() throws Exception {
     String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal";
     // first test
-    TokenStream ts = new WhitespaceTokenizer(new StringReader(input));
+    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory();
     ts = factory.create(ts);
     assertTokenStreamContents(ts,
@@ -42,7 +42,7 @@ public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
   public void testHyphenAtEnd() throws Exception {
     String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecology-";
     // first test
-    TokenStream ts = new WhitespaceTokenizer(new StringReader(input));
+    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory();
     ts = factory.create(ts);
     assertTokenStreamContents(ts,
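HyphenatedWordsFilter, exercised in the two hunks above, rejoins words that were broken across lines with a trailing hyphen. A short sketch using Solr's filter class directly:

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.Version;
    import org.apache.solr.analysis.HyphenatedWordsFilter;

    public class HyphenationSketch {
      public static void main(String[] args) throws IOException {
        String input = "ecologi-\r\ncal devel-\r\n\r\nop";
        TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_31,
            new StringReader(input));
        ts = new HyphenatedWordsFilter(ts);
        TermAttribute term = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term()); // "ecological", then "develop"
        }
      }
    }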
@@ -23,25 +23,22 @@ import java.util.Set;
 import java.util.Map;
 import java.util.HashMap;
 
-import junit.framework.TestCase;
-
-
 /**
  *
  *
  **/
-public class TestKeepFilterFactory extends TestCase{
+public class TestKeepFilterFactory extends BaseTokenTestCase{
 
   public void testInform() throws Exception {
     ResourceLoader loader = new SolrResourceLoader(null, null);
     assertTrue("loader is null and it shouldn't be", loader != null);
     KeepWordFilterFactory factory = new KeepWordFilterFactory();
-    Map<String, String> args = new HashMap<String, String>();
+    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     args.put("words", "keep-1.txt");
     args.put("ignoreCase", "true");
     factory.init(args);
     factory.inform(loader);
-    Set words = factory.getWords();
+    Set<?> words = factory.getWords();
     assertTrue("words is null and it shouldn't be", words != null);
     assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
 
@@ -41,7 +41,7 @@ public class TestKeepWordFilter extends BaseTokenTestCase {
     words.add( "bbb" );
 
     String input = "aaa BBB ccc ddd EEE";
-    Map<String,String> args = new HashMap<String, String>();
+    Map<String,String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     ResourceLoader loader = new SolrResourceLoader(null, null);
 
     // Test Stopwords
@ -51,29 +51,29 @@ public class TestKeepWordFilter extends BaseTokenTestCase {
|
|||
factory.inform( loader );
|
||||
factory.setWords( words );
|
||||
assertTrue(factory.isIgnoreCase());
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(new StringReader(input)));
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
|
||||
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
|
||||
|
||||
// Test Stopwords (ignoreCase via the setter instead)
|
||||
factory = new KeepWordFilterFactory();
|
||||
args = new HashMap<String, String>();
|
||||
args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
|
||||
factory.init( args );
|
||||
factory.inform( loader );
|
||||
factory.setIgnoreCase(true);
|
||||
factory.setWords( words );
|
||||
assertTrue(factory.isIgnoreCase());
|
||||
stream = factory.create(new WhitespaceTokenizer(new StringReader(input)));
|
||||
stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
|
||||
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
|
||||
|
||||
// Now force case
|
||||
factory = new KeepWordFilterFactory();
|
||||
args = new HashMap<String, String>();
|
||||
args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
|
||||
args.put( "ignoreCase", "false" );
|
||||
factory.init( args );
|
||||
factory.inform( loader );
|
||||
factory.setWords( words );
|
||||
assertFalse(factory.isIgnoreCase());
|
||||
stream = factory.create(new WhitespaceTokenizer(new StringReader(input)));
|
||||
stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
|
||||
assertTokenStreamContents(stream, new String[] { "aaa" });
|
||||
}
|
||||
}
|
||||
|
|
|
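KeepWordFilterFactory is a stop filter inverted: only listed words survive, and ignoreCase controls whether "BBB" matches "bbb". A sketch of the behavior asserted above; note the real tests also seed the args map with a version parameter and call inform(loader) first:

    import java.io.StringReader;
    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.Set;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.util.Version;
    import org.apache.solr.analysis.KeepWordFilterFactory;

    public class KeepWordsSketch {
      public static void main(String[] a) throws Exception {
        Set<String> words = new HashSet<String>();
        words.add("aaa");
        words.add("bbb");

        KeepWordFilterFactory factory = new KeepWordFilterFactory();
        factory.init(new HashMap<String, String>());
        factory.setIgnoreCase(true); // "BBB" now matches "bbb"
        factory.setWords(words);

        TokenStream stream = factory.create(new WhitespaceTokenizer(Version.LUCENE_31,
            new StringReader("aaa BBB ccc ddd EEE")));
        // keeps "aaa" and "BBB"; with setIgnoreCase(false), only "aaa" survives
      }
    }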
@@ -0,0 +1,65 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.PorterStemFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.core.SolrResourceLoader;
+
+/**
+ * Simple tests to ensure the keyword marker filter factory is working.
+ */
+public class TestKeywordMarkerFilterFactory extends BaseTokenTestCase {
+  public void testKeywords() throws IOException {
+    Reader reader = new StringReader("dogs cats");
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
+    KeywordMarkerFilterFactory factory = new KeywordMarkerFilterFactory();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
+    ResourceLoader loader = new SolrResourceLoader(null, null);
+    args.put("protected", "protwords.txt");
+    factory.init(args);
+    factory.inform(loader);
+
+    TokenStream ts = new PorterStemFilter(factory.create(tokenizer));
+    assertTokenStreamContents(ts, new String[] { "dog", "cats" });
+  }
+
+  public void testKeywordsCaseInsensitive() throws IOException {
+    Reader reader = new StringReader("dogs cats Cats");
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
+    KeywordMarkerFilterFactory factory = new KeywordMarkerFilterFactory();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
+    ResourceLoader loader = new SolrResourceLoader(null, null);
+    args.put("protected", "protwords.txt");
+    args.put("ignoreCase", "true");
+    factory.init(args);
+    factory.inform(loader);
+
+    TokenStream ts = new PorterStemFilter(factory.create(tokenizer));
+    assertTokenStreamContents(ts, new String[] { "dog", "cats", "Cats" });
+  }
+}
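The new test above is the consumer side of the KeywordMarkerFilterFactory introduced by SOLR-1857, and the case-insensitive variant comes down to the ignoreCase flag on the protected-word set. A minimal sketch with the underlying Lucene classes, assuming the 3.1-era KeywordMarkerFilter and CharArraySet APIs:

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.CharArraySet;
    import org.apache.lucene.analysis.KeywordMarkerFilter;
    import org.apache.lucene.analysis.PorterStemFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.Version;

    public class KeywordMarkerSketch {
      public static void main(String[] args) throws IOException {
        // ignoreCase=true: the set matches "Cats" as well as "cats".
        CharArraySet protwords = new CharArraySet(Version.LUCENE_31, 1, true);
        protwords.add("cats");

        TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_31,
            new StringReader("dogs cats Cats"));
        ts = new PorterStemFilter(new KeywordMarkerFilter(ts, protwords));

        TermAttribute term = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term()); // dog, cats, Cats
        }
      }
    }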
@@ -20,7 +20,7 @@ public class TestMultiWordSynonyms extends BaseTokenTestCase {
     SynonymMap synMap = new SynonymMap(true);
     SynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null);
 
-    SynonymFilter ts = new SynonymFilter(new WhitespaceTokenizer(new StringReader("a e")), synMap);
+    SynonymFilter ts = new SynonymFilter(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("a e")), synMap);
     // This fails because ["e","e"] is the value of the token stream
     assertTokenStreamContents(ts, new String[] { "a", "e" });
   }