mirror of https://github.com/apache/lucene.git
SOLR-1857: cleanup and sync analysis with Lucene trunk
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@929782 13f79535-47bb-0310-9956-ffa450edef68
commit 3860c16a66
parent a528a707c1
CHANGES.txt
@@ -126,6 +126,14 @@ New Features
 
 * SOLR-1769: Solr 1.4 Replication - Repeater throwing NullPointerException (Jörgen Rydenius via noble)
 
+* SOLR-1857: Synced Solr analysis with Lucene 3.1. Added KeywordMarkerFilterFactory
+  and StemmerOverrideFilterFactory, which can be used to tune stemming algorithms.
+  Added factories for Bulgarian, Czech, Hindi, and Turkish analysis. Improved the
+  performance of SnowballPorterFilterFactory. (rmuir)
+
+* SOLR-1657: Converted remaining TokenStreams to the Attributes-based API. All Solr
+  TokenFilters now support custom Attributes, and some have improved performance:
+  especially WordDelimiterFilter and CommonGramsFilter. (rmuir, cmale, uschindler)
 
 Optimizations
 ----------------------
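Note: the stemming-tuning hook named in the SOLR-1857 entry works by marking tokens before a stemmer sees them. A minimal sketch of the underlying Lucene mechanism, not taken from this diff; WhitespaceTokenizer and PorterStemFilter stand in for whatever chain a field type actually configures, and the protected word is illustrative:

import java.io.StringReader;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;

public class ProtectedStemmingSketch {
  public static TokenStream chain(String text) {
    // hypothetical protected-word set: entries are never altered by the stemmer
    CharArraySet protectedWords = new CharArraySet(Version.LUCENE_31, 16, false);
    protectedWords.add("lucene");
    TokenStream ts = new WhitespaceTokenizer(new StringReader(text));
    ts = new KeywordMarkerTokenFilter(ts, protectedWords); // sets KeywordAttribute
    return new PorterStemFilter(ts); // 3.1-era stemmers skip tokens flagged as keywords
  }
}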
ASCIIFoldingFilterFactory.java
@@ -18,9 +18,10 @@
 
 
 package org.apache.solr.analysis;
-import org.apache.lucene.analysis.*;
-import org.apache.lucene.util.ArrayUtil;
-import java.util.Map;
+import org.apache.lucene.analysis.ASCIIFoldingFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/** Factory for {@link ASCIIFoldingFilter} */
 public class ASCIIFoldingFilterFactory extends BaseTokenFilterFactory {
   public ASCIIFoldingFilter create(TokenStream input) {
     return new ASCIIFoldingFilter(input);
ArabicLetterTokenizerFactory.java
@@ -16,15 +16,13 @@ package org.apache.solr.analysis;
  * limitations under the License.
  */
 
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
 
 import java.io.Reader;
 
 
 /**
- *
- *
+ * Factory for {@link ArabicLetterTokenizer}
 **/
 public class ArabicLetterTokenizerFactory extends BaseTokenizerFactory{
 
ArabicNormalizationFilterFactory.java
@@ -21,8 +21,7 @@ import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
 
 
 /**
- *
- *
+ * Factory for {@link ArabicNormalizationFilter}
 **/
 public class ArabicNormalizationFilterFactory extends BaseTokenFilterFactory{
 
ArabicStemFilterFactory.java
@@ -21,8 +21,7 @@ import org.apache.lucene.analysis.ar.ArabicStemFilter;
 
 
 /**
- *
- *
+ * Factory for {@link ArabicStemFilter}
 **/
 public class ArabicStemFilterFactory extends BaseTokenFilterFactory{
 
BaseTokenStreamFactory.java
@@ -17,13 +17,17 @@
 
 package org.apache.solr.analysis;
 
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.util.StrUtils;
 import org.apache.solr.core.Config;
-import org.apache.solr.common.SolrException;
 import org.apache.solr.schema.IndexSchema;
 
+import java.io.IOException;
+import java.util.List;
 import java.util.Map;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.util.Version;
 
 
@@ -94,4 +98,22 @@ abstract class BaseTokenStreamFactory {
     return Boolean.parseBoolean(s);
   }
 
+  protected CharArraySet getWordSet(ResourceLoader loader,
+      String wordFiles, boolean ignoreCase) throws IOException {
+    assureMatchVersion();
+    List<String> files = StrUtils.splitFileNames(wordFiles);
+    CharArraySet words = null;
+    if (files.size() > 0) {
+      // default stopwords list has 35 or so words, but maybe don't make it that
+      // big to start
+      words = new CharArraySet(luceneMatchVersion,
+          files.size() * 10, ignoreCase);
+      for (String file : files) {
+        List<String> wlist = loader.getLines(file.trim());
+        words.addAll(StopFilter.makeStopSet(luceneMatchVersion, wlist,
+            ignoreCase));
+      }
+    }
+    return words;
+  }
 }
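Note: getWordSet() is the shared helper the rest of this commit migrates the factories onto: it splits a comma-separated file list, reads each file through the ResourceLoader, and folds everything into one Version-aware CharArraySet. A hypothetical factory consuming it (illustration only; the factory name is invented, and the real usages follow in the hunks below):

package org.apache.solr.analysis;

import java.io.IOException;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.util.plugin.ResourceLoaderAware;

public class ExampleWordListFilterFactory extends BaseTokenFilterFactory
    implements ResourceLoaderAware {
  private CharArraySet words;

  public void inform(ResourceLoader loader) {
    String wordFiles = args.get("words"); // e.g. "a.txt, b.txt"
    if (wordFiles != null) {
      try {
        // one call replaces the hand-rolled split/getLines/addAll loops
        words = getWordSet(loader, wordFiles, getBoolean("ignoreCase", false));
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  }

  public TokenStream create(TokenStream input) {
    // signature confirmed by this commit's CommonGramsFilter changes
    return new CommonGramsFilter(luceneMatchVersion, input, words);
  }
}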
BrazilianStemFilterFactory.java
@@ -18,15 +18,10 @@
 
 
 package org.apache.solr.analysis;
-import org.apache.lucene.analysis.br.*;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import java.io.IOException;
-import java.util.HashSet;
-import java.util.Hashtable;
-import java.util.Set;
-import java.util.Map;
+import org.apache.lucene.analysis.br.BrazilianStemFilter;
+
+/** Factory for {@link BrazilianStemFilter} */
 public class BrazilianStemFilterFactory extends BaseTokenFilterFactory {
   public BrazilianStemFilter create(TokenStream in) {
     return new BrazilianStemFilter(in);
BufferedTokenStream.java
@@ -73,12 +73,12 @@ public abstract class BufferedTokenStream extends TokenFilter {
   private final LinkedList<Token> inQueue = new LinkedList<Token>();
   private final LinkedList<Token> outQueue = new LinkedList<Token>();
 
-  private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
-  private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
-  private final TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
-  private final FlagsAttribute flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
-  private final PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
-  private final PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
 
   public BufferedTokenStream(TokenStream input) {
     super(input);
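Note: the cast removals here (and in the later attribute hunks) are possible because AttributeSource.addAttribute was generified in Lucene 3.0 to <A extends Attribute> A addAttribute(Class<A>), so the compiler infers the concrete attribute type. A sketch of the pattern in isolation, with a hypothetical filter:

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public final class PassThroughFilter extends TokenFilter {
  // no (TermAttribute) cast needed: the generic return type is inferred
  private final TermAttribute termAtt = addAttribute(TermAttribute.class);

  public PassThroughFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    // termAtt.term() would expose the current token text to a real filter
    return input.incrementToken();
  }
}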
BulgarianStemFilterFactory.java
@@ -20,7 +20,7 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.bg.BulgarianStemFilter;
 
-/** Factory for BulgarianStemFilter */
+/** Factory for {@link BulgarianStemFilter} */
 public class BulgarianStemFilterFactory extends BaseTokenFilterFactory {
   public TokenStream create(TokenStream input) {
     return new BulgarianStemFilter(input);
CJKTokenizerFactory.java
@@ -18,11 +18,11 @@
 
 
 package org.apache.solr.analysis;
-import org.apache.lucene.analysis.cjk.*;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.cjk.CJKTokenizer;
+
 import java.io.Reader;
-import java.util.Map;
+
+/** Factory for {@link CJKTokenizer} */
 public class CJKTokenizerFactory extends BaseTokenizerFactory {
   public CJKTokenizer create(Reader in) {
     return new CJKTokenizer(in);
CapitalizationFilterFactory.java
@@ -75,6 +75,7 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
   @Override
   public void init(Map<String, String> args) {
     super.init(args);
+    assureMatchVersion();
 
     String k = args.get(KEEP);
     if (k != null) {
@@ -84,7 +85,7 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
       if ("true".equalsIgnoreCase(ignoreStr)) {
         ignoreCase = true;
       }
-      keep = new CharArraySet(10, ignoreCase);
+      keep = new CharArraySet(luceneMatchVersion, 10, ignoreCase);
       while (st.hasMoreTokens()) {
         k = st.nextToken().trim();
         keep.add(k.toCharArray());
@@ -194,7 +195,7 @@ class CapitalizationFilter extends TokenFilter {
   public CapitalizationFilter(TokenStream in, final CapitalizationFilterFactory factory) {
     super(in);
     this.factory = factory;
-    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    this.termAtt = addAttribute(TermAttribute.class);
   }
 
   @Override
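Note: the luceneMatchVersion argument now threaded into CharArraySet here (and throughout this commit) is a back-compat switch; to my understanding it chiefly controls whether case-insensitive matching handles supplementary (non-BMP) characters correctly in 3.1. A small sketch, with an illustrative word:

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;

public class CharArraySetSketch {
  public static void main(String[] args) {
    // ignoreCase=true: lookups match regardless of case, without
    // allocating lowercased copies of the stored words
    CharArraySet keep = new CharArraySet(Version.LUCENE_31, 10, true);
    keep.add("Solr");
    System.out.println(keep.contains("SOLR")); // true
  }
}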
ChineseFilterFactory.java
@@ -18,10 +18,14 @@
 
 
 package org.apache.solr.analysis;
-import org.apache.lucene.analysis.cn.*;
-import java.util.Hashtable;
-import org.apache.lucene.analysis.*;
-import java.util.Map;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.cn.ChineseFilter;
+
+/**
+ * Factory for {@link ChineseFilter}
+ * @deprecated Use {@link StopFilterFactory} instead.
+ */
+@Deprecated
 public class ChineseFilterFactory extends BaseTokenFilterFactory {
   public ChineseFilter create(TokenStream in) {
     return new ChineseFilter(in);
ChineseTokenizerFactory.java
@@ -18,10 +18,15 @@
 
 
 package org.apache.solr.analysis;
-import org.apache.lucene.analysis.cn.*;
+
 import java.io.Reader;
-import org.apache.lucene.analysis.*;
-import java.util.Map;
+import org.apache.lucene.analysis.cn.ChineseTokenizer;
+
+/**
+ * Factory for {@link ChineseTokenizer}
+ * @deprecated Use {@link StandardTokenizerFactory} instead.
+ */
+@Deprecated
 public class ChineseTokenizerFactory extends BaseTokenizerFactory {
   public ChineseTokenizer create(Reader in) {
     return new ChineseTokenizer(in);
CommonGramsFilter.java
@@ -20,6 +20,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.Version;
 
 /*
  * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors
@@ -51,15 +52,25 @@ public final class CommonGramsFilter extends TokenFilter {
 
   private final StringBuilder buffer = new StringBuilder();
 
-  private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
-  private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
-  private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class);
-  private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+  private final TermAttribute termAttribute = addAttribute(TermAttribute.class);
+  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
+  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
 
   private int lastStartOffset;
   private boolean lastWasCommon;
   private State savedState;
 
+  /** @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set)} instead */
+  public CommonGramsFilter(TokenStream input, Set<?> commonWords) {
+    this(Version.LUCENE_29, input, commonWords);
+  }
+
+  /** @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set, boolean)} instead */
+  public CommonGramsFilter(TokenStream input, Set<?> commonWords, boolean ignoreCase) {
+    this(Version.LUCENE_29, input, commonWords, ignoreCase);
+  }
+
   /**
    * Construct a token stream filtering the given input using a Set of common
    * words to create bigrams. Outputs both unigrams with position increment and
@@ -69,8 +80,8 @@ public final class CommonGramsFilter extends TokenFilter {
    * @param input TokenStream input in filter chain
    * @param commonWords The set of common words.
    */
-  public CommonGramsFilter(TokenStream input, Set commonWords) {
-    this(input, commonWords, false);
+  public CommonGramsFilter(Version matchVersion, TokenStream input, Set<?> commonWords) {
+    this(matchVersion, input, commonWords, false);
   }
 
   /**
@@ -90,12 +101,12 @@ public final class CommonGramsFilter extends TokenFilter {
    * @param commonWords The set of common words.
    * @param ignoreCase -Ignore case when constructing bigrams for common words.
    */
-  public CommonGramsFilter(TokenStream input, Set commonWords, boolean ignoreCase) {
+  public CommonGramsFilter(Version matchVersion, TokenStream input, Set<?> commonWords, boolean ignoreCase) {
     super(input);
     if (commonWords instanceof CharArraySet) {
       this.commonWords = (CharArraySet) commonWords;
     } else {
-      this.commonWords = new CharArraySet(commonWords.size(), ignoreCase);
+      this.commonWords = new CharArraySet(matchVersion, commonWords.size(), ignoreCase);
       this.commonWords.addAll(commonWords);
     }
   }
@@ -106,7 +117,9 @@ public final class CommonGramsFilter extends TokenFilter {
    *
    * @param input Tokenstream in filter chain
    * @param commonWords words to be used in constructing bigrams
+   * @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set)} instead.
    */
+  @Deprecated
   public CommonGramsFilter(TokenStream input, String[] commonWords) {
     this(input, commonWords, false);
   }
@@ -118,7 +131,9 @@ public final class CommonGramsFilter extends TokenFilter {
    * @param input Tokenstream in filter chain
    * @param commonWords words to be used in constructing bigrams
    * @param ignoreCase -Ignore case when constructing bigrams for common words.
+   * @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set, boolean)} instead.
    */
+  @Deprecated
   public CommonGramsFilter(TokenStream input, String[] commonWords, boolean ignoreCase) {
     super(input);
     this.commonWords = makeCommonSet(commonWords, ignoreCase);
@@ -132,7 +147,9 @@ public final class CommonGramsFilter extends TokenFilter {
    * @param commonWords Array of common words which will be converted into the CharArraySet
    * @return CharArraySet of the given words, appropriate for passing into the CommonGramFilter constructor
    * @see #makeCommonSet(java.lang.String[], boolean) passing false to ignoreCase
+   * @deprecated create a CharArraySet with CharArraySet instead
    */
+  @Deprecated
   public static CharArraySet makeCommonSet(String[] commonWords) {
     return makeCommonSet(commonWords, false);
   }
@@ -145,7 +162,9 @@ public final class CommonGramsFilter extends TokenFilter {
    * @param commonWords Array of common words which will be converted into the CharArraySet
    * @param ignoreCase If true, all words are lower cased first.
    * @return a Set containing the words
+   * @deprecated create a CharArraySet with CharArraySet instead
    */
+  @Deprecated
   public static CharArraySet makeCommonSet(String[] commonWords, boolean ignoreCase) {
     CharArraySet commonSet = new CharArraySet(commonWords.length, ignoreCase);
     commonSet.addAll(Arrays.asList(commonWords));
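Note: the net effect of the CommonGramsFilter hunks is that the preferred constructors now take a Version up front, while the old signatures delegate with a hardcoded Version.LUCENE_29 so existing callers keep their exact semantics. A usage sketch of the new form (inline word set for brevity; a schema would load it from a file):

import java.io.StringReader;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;
import org.apache.solr.analysis.CommonGramsFilter;

public class CommonGramsSketch {
  public static TokenStream build(String text) {
    CharArraySet common = new CharArraySet(Version.LUCENE_31, 8, true);
    common.add("the");
    common.add("of");
    // emits unigrams plus bigrams such as "the_quick" for phrase-query speedups
    return new CommonGramsFilter(Version.LUCENE_31,
        new WhitespaceTokenizer(new StringReader(text)), common);
  }
}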
CommonGramsFilterFactory.java
@@ -17,14 +17,12 @@
 package org.apache.solr.analysis;
 
 import java.io.IOException;
-import java.util.List;
 import java.util.Set;
 
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.StopAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.solr.common.ResourceLoader;
-import org.apache.solr.common.util.StrUtils;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
 
 /**
@@ -43,16 +41,7 @@ public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements
 
     if (commonWordFiles != null) {
       try {
-        List<String> files = StrUtils.splitFileNames(commonWordFiles);
-        if (commonWords == null && files.size() > 0){
-          //default stopwords list has 35 or so words, but maybe don't make it that big to start
-          commonWords = new CharArraySet(files.size() * 10, ignoreCase);
-        }
-        for (String file : files) {
-          List<String> wlist = loader.getLines(file.trim());
-          //TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call
-          commonWords.addAll(CommonGramsFilter.makeCommonSet((String[])wlist.toArray(new String[0]), ignoreCase));
-        }
+        commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
       } catch (IOException e) {
         throw new RuntimeException(e);
       }
@@ -69,12 +58,12 @@ public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements
     return ignoreCase;
   }
 
-  public Set getCommonWords() {
+  public Set<?> getCommonWords() {
     return commonWords;
   }
 
   public CommonGramsFilter create(TokenStream input) {
-    CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords, ignoreCase);
+    CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords, ignoreCase);
     return commonGrams;
   }
 }
CommonGramsQueryFilter.java
@@ -47,8 +47,8 @@ import static org.apache.solr.analysis.CommonGramsFilter.GRAM_TYPE;
  */
 public final class CommonGramsQueryFilter extends TokenFilter {
 
-  private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class);
-  private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
+  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
 
   private State previous;
   private String previousType;
CommonGramsQueryFilterFactory.java
@@ -17,14 +17,13 @@
 package org.apache.solr.analysis;
 
 import java.io.IOException;
-import java.util.List;
+import java.util.Map;
 import java.util.Set;
 
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.StopAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.solr.common.ResourceLoader;
-import org.apache.solr.common.util.StrUtils;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
 
 /**
@@ -36,25 +35,19 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
 public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
     implements ResourceLoaderAware {
 
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
+    assureMatchVersion();
+  }
+
   public void inform(ResourceLoader loader) {
     String commonWordFiles = args.get("words");
     ignoreCase = getBoolean("ignoreCase", false);
 
     if (commonWordFiles != null) {
       try {
-        List<String> files = StrUtils.splitFileNames(commonWordFiles);
-        if (commonWords == null && files.size() > 0) {
-          // default stopwords list has 35 or so words, but maybe don't make it
-          // that big to start
-          commonWords = new CharArraySet(files.size() * 10, ignoreCase);
-        }
-        for (String file : files) {
-          List<String> wlist = loader.getLines(file.trim());
-          // TODO: once StopFilter.makeStopSet(List) method is available, switch
-          // to using that so we can avoid a toArray() call
-          commonWords.addAll(CommonGramsFilter.makeCommonSet((String[]) wlist
-              .toArray(new String[0]), ignoreCase));
-        }
+        commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
       } catch (IOException e) {
         throw new RuntimeException(e);
       }
@@ -73,7 +66,7 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
     return ignoreCase;
   }
 
-  public Set getCommonWords() {
+  public Set<?> getCommonWords() {
     return commonWords;
   }
 
@@ -81,7 +74,7 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
    * Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter
    */
   public CommonGramsQueryFilter create(TokenStream input) {
-    CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords,
+    CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords,
         ignoreCase);
     CommonGramsQueryFilter commonGramsQuery = new CommonGramsQueryFilter(
         commonGrams);
CzechStemFilterFactory.java
@@ -20,7 +20,7 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.cz.CzechStemFilter;
 
-/** Factory for CzechStemFilter */
+/** Factory for {@link CzechStemFilter} */
 public class CzechStemFilterFactory extends BaseTokenFilterFactory {
   public TokenStream create(TokenStream input) {
     return new CzechStemFilter(input);
DelimitedPayloadTokenFilterFactory.java
@@ -31,7 +31,7 @@ import java.util.Map;
 
 /**
  *
- *
+ * Factory for {@link DelimitedPayloadTokenFilter}
 **/
 public class DelimitedPayloadTokenFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
   public static final String ENCODER_ATTR = "encoder";
DictionaryCompoundWordTokenFilterFactory.java
@@ -18,20 +18,18 @@
 
 
 package org.apache.solr.analysis;
-import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.compound.*;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.common.SolrException;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
-import java.util.List;
-import java.util.Set;
 import java.util.Map;
 import java.io.IOException;
 
+/** Factory for {@link DictionaryCompoundWordTokenFilter} */
 public class DictionaryCompoundWordTokenFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
-  private Set dictionary;
+  private CharArraySet dictionary;
   private String dictFile;
   private int minWordSize;
   private int minSubwordSize;
@@ -39,6 +37,7 @@ public class DictionaryCompoundWordTokenFilterFactory extends BaseTokenFilterFac
   private boolean onlyLongestMatch;
   public void init(Map<String, String> args) {
     super.init(args);
+    assureMatchVersion();
     dictFile = args.get("dictionary");
     if (null == dictFile) {
       throw new SolrException( SolrException.ErrorCode.SERVER_ERROR,
@@ -52,14 +51,13 @@ public class DictionaryCompoundWordTokenFilterFactory extends BaseTokenFilterFac
   }
   public void inform(ResourceLoader loader) {
     try {
-      List<String> wlist = loader.getLines(dictFile);
-      dictionary = StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), false);
+      dictionary = super.getWordSet(loader, dictFile, false);
     } catch (IOException e) {
       throw new RuntimeException(e);
     }
   }
   public DictionaryCompoundWordTokenFilter create(TokenStream input) {
-    return new DictionaryCompoundWordTokenFilter(input,dictionary,minWordSize,minSubwordSize,maxSubwordSize,onlyLongestMatch);
+    return new DictionaryCompoundWordTokenFilter(luceneMatchVersion,input,dictionary,minWordSize,minSubwordSize,maxSubwordSize,onlyLongestMatch);
   }
 }
 
DoubleMetaphoneFilter.java
@@ -20,11 +20,9 @@ import java.io.IOException;
 import java.util.LinkedList;
 
 import org.apache.commons.codec.language.DoubleMetaphone;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 
 public class DoubleMetaphoneFilter extends TokenFilter {
@@ -41,8 +39,8 @@ public class DoubleMetaphoneFilter extends TokenFilter {
     super(input);
     this.encoder.setMaxCodeLen(maxCodeLength);
     this.inject = inject;
-    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
-    this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+    this.termAtt = addAttribute(TermAttribute.class);
+    this.posAtt = addAttribute(PositionIncrementAttribute.class);
   }
 
   @Override
DutchStemFilterFactory.java
@@ -18,19 +18,19 @@
 
 
 package org.apache.solr.analysis;
-import org.apache.lucene.analysis.nl.*;
-import org.apache.lucene.analysis.Token;
+
+import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.Map;
-import java.util.Map;
+
+/**
+ * @deprecated Use {@link SnowballPorterFilterFactory} with "Dutch" instead,
+ * which has the same functionality.
+ */
+@Deprecated
 public class DutchStemFilterFactory extends BaseTokenFilterFactory {
-  public DutchStemFilter create(TokenStream _in) {
-    return new DutchStemFilter(_in);
+  public TokenFilter create(TokenStream _in) {
+    return new SnowballFilter(_in, new org.tartarus.snowball.ext.DutchStemmer());
   }
 }
 
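Note: DutchStemFilterFactory (and FrenchStemFilterFactory below) now deprecate themselves in favor of SnowballPorterFilterFactory with the matching language. A sketch of the replacement configured programmatically; the "language" argument is what selects org.tartarus.snowball.ext.DutchStemmer, and schema.xml would pass the same attribute. Details of when that factory resolves the stemmer class are an assumption here:

package org.apache.solr.analysis;

import java.util.HashMap;
import java.util.Map;

public class SnowballReplacementSketch {
  public static SnowballPorterFilterFactory dutchStemmerFactory() {
    SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
    Map<String, String> args = new HashMap<String, String>();
    args.put("language", "Dutch"); // picks the Dutch Snowball program
    factory.init(args);
    // inform(loader) would additionally load a "protected" word list if configured
    return factory;
  }
}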
ElisionFilterFactory.java
@@ -21,32 +21,22 @@ package org.apache.solr.analysis;
 
 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
-import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.fr.*;
 import java.io.IOException;
-import java.util.Set;
-import java.util.HashSet;
-import java.util.Arrays;
-import java.util.Iterator;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.TokenFilter;
-import java.util.Map;
-import java.util.List;
-import java.util.Set;
-import java.io.IOException;
 
+
+/** Factory for {@link ElisionFilter} */
 public class ElisionFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
 
-  private Set articles;
+  private CharArraySet articles;
 
   public void inform(ResourceLoader loader) {
     String articlesFile = args.get("articles");
 
     if (articlesFile != null) {
       try {
-        List<String> wlist = loader.getLines(articlesFile);
-        articles = StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), false);
+        articles = getWordSet(loader, articlesFile, false);
       } catch (IOException e) {
         throw new RuntimeException(e);
       }
EnglishPorterFilterFactory.java
@@ -18,17 +18,14 @@
 package org.apache.solr.analysis;
 
 import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.solr.common.ResourceLoader;
-import org.apache.solr.common.util.StrUtils;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
-import org.tartarus.snowball.SnowballProgram;
 
 import java.io.IOException;
-import java.io.File;
-import java.util.List;
 
 /**
  * @version $Id$
@@ -42,21 +39,7 @@ public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implement
     String wordFiles = args.get(PROTECTED_TOKENS);
     if (wordFiles != null) {
       try {
-        File protectedWordFiles = new File(wordFiles);
-        if (protectedWordFiles.exists()) {
-          List<String> wlist = loader.getLines(wordFiles);
-          //This cast is safe in Lucene
-          protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
-        } else {
-          List<String> files = StrUtils.splitFileNames(wordFiles);
-          for (String file : files) {
-            List<String> wlist = loader.getLines(file.trim());
-            if (protectedWords == null)
-              protectedWords = new CharArraySet(wlist, false);
-            else
-              protectedWords.addAll(wlist);
-          }
-        }
+        protectedWords = getWordSet(loader, wordFiles, false);
       } catch (IOException e) {
         throw new RuntimeException(e);
       }
@@ -65,20 +48,10 @@ public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implement
 
   private CharArraySet protectedWords = null;
 
-  public EnglishPorterFilter create(TokenStream input) {
-    return new EnglishPorterFilter(input, protectedWords);
+  public TokenFilter create(TokenStream input) {
+    if (protectedWords != null)
+      input = new KeywordMarkerTokenFilter(input, protectedWords);
+    return new SnowballFilter(input, new org.tartarus.snowball.ext.EnglishStemmer());
   }
 
 }
 
-
-/**
- * English Porter2 filter that doesn't use reflection to
- * adapt lucene to the snowball stemmer code.
- */
-@Deprecated
-class EnglishPorterFilter extends SnowballPorterFilter {
-  public EnglishPorterFilter(TokenStream source, CharArraySet protWords) {
-    super(source, new org.tartarus.snowball.ext.EnglishStemmer(), protWords);
-  }
-}
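Note: the rewritten create() above composes two stock filters instead of the reflection-based EnglishPorterFilter it deletes. Extracted as a standalone sketch of that chain:

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballFilter;

public class EnglishSnowballChainSketch {
  public static TokenStream wrap(TokenStream input, CharArraySet protectedWords) {
    if (protectedWords != null) {
      // marked tokens carry KeywordAttribute=true through the chain
      input = new KeywordMarkerTokenFilter(input, protectedWords);
    }
    // SnowballFilter consults KeywordAttribute and leaves marked tokens unstemmed
    return new SnowballFilter(input, new org.tartarus.snowball.ext.EnglishStemmer());
  }
}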
FrenchStemFilterFactory.java
@@ -18,18 +18,19 @@
 
 
 package org.apache.solr.analysis;
-import org.apache.lucene.analysis.fr.*;
-import org.apache.lucene.analysis.Token;
+
+import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import java.io.IOException;
-import java.util.Hashtable;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.Map;
+
+/**
+ * @deprecated Use {@link SnowballPorterFilterFactory} with "French" instead,
+ * which has the same functionality.
+ */
+@Deprecated
 public class FrenchStemFilterFactory extends BaseTokenFilterFactory {
-  public FrenchStemFilter create(TokenStream in) {
-    return new FrenchStemFilter(in);
+  public TokenFilter create(TokenStream in) {
+    return new SnowballFilter(in, new org.tartarus.snowball.ext.FrenchStemmer());
   }
 }
 
GermanStemFilterFactory.java
@@ -18,13 +18,11 @@
 
 
 package org.apache.solr.analysis;
-import org.apache.lucene.analysis.de.*;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.de.GermanStemFilter;
 import org.apache.lucene.analysis.TokenStream;
-import java.io.IOException;
-import java.util.Set;
-import java.util.Map;
+
+/** Factory for {@link GermanStemFilter} */
 public class GermanStemFilterFactory extends BaseTokenFilterFactory {
   public GermanStemFilter create(TokenStream in) {
     return new GermanStemFilter(in);
GreekLowerCaseFilterFactory.java
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.el.GreekLowerCaseFilter;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
 
+/** Factory for {@link GreekLowerCaseFilter} */
 public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory
 {
 
HindiNormalizationFilterFactory.java
@@ -20,7 +20,7 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.hi.HindiNormalizationFilter;
 
-/** Factory for HindiNormalizationFilter */
+/** Factory for {@link HindiNormalizationFilter} */
 public class HindiNormalizationFilterFactory extends BaseTokenFilterFactory {
   public TokenStream create(TokenStream input) {
     return new HindiNormalizationFilter(input);
HindiStemFilterFactory.java
@@ -20,7 +20,7 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.hi.HindiStemFilter;
 
-/** Factory for HindiStemFilter */
+/** Factory for {@link HindiStemFilter} */
 public class HindiStemFilterFactory extends BaseTokenFilterFactory {
   public TokenStream create(TokenStream input) {
     return new HindiStemFilter(input);
HyphenatedWordsFilter.java
@@ -54,8 +54,8 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
  */
 public final class HyphenatedWordsFilter extends TokenFilter {
 
-  private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
-  private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+  private final TermAttribute termAttribute = addAttribute(TermAttribute.class);
+  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
 
   private final StringBuilder hyphenated = new StringBuilder();
   private State savedState;
HyphenatedWordsFilterFactory.java
@@ -21,7 +21,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.solr.analysis.BaseTokenFilterFactory;
 
 /**
- * Factory for HyphenatedWordsFilter
+ * Factory for {@link HyphenatedWordsFilter}
  */
 public class HyphenatedWordsFilterFactory extends BaseTokenFilterFactory {
   public HyphenatedWordsFilter create(TokenStream input) {
ISOLatin1AccentFilterFactory.java
@@ -21,8 +21,10 @@ import org.apache.lucene.analysis.ISOLatin1AccentFilter;
 import org.apache.lucene.analysis.TokenStream;
 
 /** Factory for ISOLatin1AccentFilter
+ * @deprecated Use {@link ASCIIFoldingFilterFactory} instead.
  * $Id$
  */
+@Deprecated
 public class ISOLatin1AccentFilterFactory extends BaseTokenFilterFactory {
   public ISOLatin1AccentFilter create(TokenStream input) {
     return new ISOLatin1AccentFilter(input);
IndicNormalizationFilterFactory.java
@@ -20,7 +20,7 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.in.IndicNormalizationFilter;
 
-/** Factory for IndicNormalizationFilter */
+/** Factory for {@link IndicNormalizationFilter} */
 public class IndicNormalizationFilterFactory extends BaseTokenFilterFactory {
   public TokenStream create(TokenStream input) {
     return new IndicNormalizationFilter(input);
IndicTokenizerFactory.java
@@ -22,7 +22,7 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.in.IndicTokenizer;
 
-/** Factory for IndicTokenizer */
+/** Factory for {@link IndicTokenizer} */
 public class IndicTokenizerFactory extends BaseTokenizerFactory {
   public Tokenizer create(Reader input) {
     assureMatchVersion();
KeepWordFilter.java
@@ -19,10 +19,8 @@ package org.apache.solr.analysis;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.tartarus.snowball.SnowballProgram;
 
 import java.io.IOException;
 import java.util.Set;
@@ -38,6 +36,8 @@ public final class KeepWordFilter extends TokenFilter {
   private final CharArraySet words;
   private final TermAttribute termAtt;
 
+  /** @deprecated Use {@link #KeepWordFilter(TokenStream, Set, boolean)} instead */
+  @Deprecated
   public KeepWordFilter(TokenStream in, Set<String> words, boolean ignoreCase ) {
     this(in, new CharArraySet(words, ignoreCase));
   }
@@ -47,7 +47,7 @@ public final class KeepWordFilter extends TokenFilter {
   public KeepWordFilter(TokenStream in, CharArraySet words) {
     super(in);
     this.words = words;
-    this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
+    this.termAtt = addAttribute(TermAttribute.class);
   }
 
   @Override
KeepWordFilterFactory.java
@@ -18,17 +18,11 @@
 package org.apache.solr.analysis;
 
 import org.apache.solr.common.ResourceLoader;
-import org.apache.solr.common.util.StrUtils;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
-import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.CharArraySet;
 
-import java.util.HashSet;
-import java.util.List;
 import java.util.Set;
-import java.io.File;
-import java.io.File;
 import java.io.IOException;
 
 /**
@@ -40,23 +34,13 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements Res
   private CharArraySet words;
   private boolean ignoreCase;
 
-  @SuppressWarnings("unchecked")
   public void inform(ResourceLoader loader) {
     String wordFiles = args.get("words");
     ignoreCase = getBoolean("ignoreCase", false);
     if (wordFiles != null) {
       try {
-        List<String> files = StrUtils.splitFileNames(wordFiles);
-        if (words == null && files.size() > 0){
-          words = new CharArraySet(files.size() * 10, ignoreCase);
-        }
-        for (String file : files) {
-          List<String> wlist = loader.getLines(file.trim());
-          //TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call
-          words.addAll(StopFilter.makeStopSet((String[]) wlist.toArray(new String[0]), ignoreCase));
-        }
-      }
-      catch (IOException e) {
+        words = getWordSet(loader, wordFiles, ignoreCase);
+      } catch (IOException e) {
        throw new RuntimeException(e);
       }
     }
@@ -67,14 +51,14 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements Res
    * NOTE: if ignoreCase==true, the words are expected to be lowercase
    */
   public void setWords(Set<String> words) {
-    this.words = new CharArraySet(words, ignoreCase);
+    this.words = new CharArraySet(luceneMatchVersion, words, ignoreCase);
   }
 
   public void setIgnoreCase(boolean ignoreCase) {
-    this.ignoreCase = ignoreCase;
-    if (words != null) {
-      words = new CharArraySet(words, ignoreCase);
+    if (words != null && this.ignoreCase != ignoreCase) {
+      words = new CharArraySet(luceneMatchVersion, words, ignoreCase);
     }
+    this.ignoreCase = ignoreCase;
   }
 
   public KeepWordFilter create(TokenStream input) {
@@ -0,0 +1,55 @@
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Factory for {@link KeywordMarkerTokenFilter}
+ */
+public class KeywordMarkerFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+  public static final String PROTECTED_TOKENS = "protected";
+  private CharArraySet protectedWords;
+  private boolean ignoreCase;
+
+  public void inform(ResourceLoader loader) {
+    String wordFiles = args.get(PROTECTED_TOKENS);
+    ignoreCase = getBoolean("ignoreCase", false);
+    if (wordFiles != null) {
+      try {
+        protectedWords = getWordSet(loader, wordFiles, ignoreCase);
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    }
+  }
+
+  public boolean isIgnoreCase() {
+    return ignoreCase;
+  }
+
+  public TokenStream create(TokenStream input) {
+    return protectedWords == null ? input : new KeywordMarkerTokenFilter(input, protectedWords);
+  }
+}
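A sketch of how the new factory is meant to be used: the marker filter flags each listed term as a keyword so that a keyword-aware stemmer downstream leaves it unstemmed. The chain mirrors the create() methods in this commit; the tokenizer, version constant, and word list are illustrative choices, and the EnglishStemmer class is assumed by analogy with the RussianStemmer used elsewhere in this commit:

    import java.io.Reader;

    import org.apache.lucene.analysis.CharArraySet;
    import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.snowball.SnowballFilter;
    import org.apache.lucene.util.Version;

    class ProtectedStemmingSketch {
      static TokenStream build(Reader reader) {
        Version v = Version.LUCENE_31;
        CharArraySet protectedWords = new CharArraySet(v, 16, false);
        protectedWords.add("solr"); // never stem this token

        // tokenizer -> keyword marker -> stemmer
        TokenStream ts = new WhitespaceTokenizer(v, reader);
        ts = new KeywordMarkerTokenFilter(ts, protectedWords);
        return new SnowballFilter(ts, new org.tartarus.snowball.ext.EnglishStemmer());
      }
    }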
@@ -17,7 +17,6 @@
 
 package org.apache.solr.analysis;
 
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.KeywordTokenizer;
 
 import java.io.Reader;
@@ -17,17 +17,23 @@
 
 package org.apache.solr.analysis;
 
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.LetterTokenizer;
 
 import java.io.Reader;
+import java.util.Map;
 
 /**
  * @version $Id$
  */
 public class LetterTokenizerFactory extends BaseTokenizerFactory {
-  public LetterTokenizer create(Reader input) {
+
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
     assureMatchVersion();
+  }
+
+  public LetterTokenizer create(Reader input) {
     return new LetterTokenizer(luceneMatchVersion, input);
   }
 }
@@ -17,6 +17,8 @@
 
 package org.apache.solr.analysis;
 
+import java.util.Map;
+
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.LowerCaseFilter;
 
@@ -24,8 +26,13 @@ import org.apache.lucene.analysis.LowerCaseFilter;
  * @version $Id$
  */
 public class LowerCaseFilterFactory extends BaseTokenFilterFactory {
-  public LowerCaseFilter create(TokenStream input) {
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
     assureMatchVersion();
+  }
+
+  public LowerCaseFilter create(TokenStream input) {
     return new LowerCaseFilter(luceneMatchVersion,input);
   }
 }
@@ -17,17 +17,22 @@
 
 package org.apache.solr.analysis;
 
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.LowerCaseTokenizer;
 
 import java.io.Reader;
+import java.util.Map;
 
 /**
  * @version $Id$
  */
 public class LowerCaseTokenizerFactory extends BaseTokenizerFactory {
-  public LowerCaseTokenizer create(Reader input) {
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
     assureMatchVersion();
+  }
+
+  public LowerCaseTokenizer create(Reader input) {
     return new LowerCaseTokenizer(luceneMatchVersion,input);
   }
 }
@@ -18,13 +18,12 @@
 
 
 package org.apache.solr.analysis;
-import org.apache.lucene.analysis.payloads.*;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.payloads.NumericPayloadTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.index.Payload;
-import java.io.IOException;
 import java.util.Map;
 
+/** Factory for {@link NumericPayloadTokenFilter} */
 public class NumericPayloadTokenFilterFactory extends BaseTokenFilterFactory {
   private float payload;
   private String typeMatch;
@@ -19,13 +19,10 @@ package org.apache.solr.analysis;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 import java.util.regex.Pattern;
 import java.util.regex.Matcher;
-import java.util.Set;
 import java.io.IOException;
 import java.nio.CharBuffer;
 
@@ -66,7 +63,7 @@ public final class PatternReplaceFilter extends TokenFilter {
     this.p=p;
     this.replacement = (null == replacement) ? "" : replacement;
     this.all=all;
-    this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
+    this.termAtt = addAttribute(TermAttribute.class);
   }
 
   @Override
@@ -56,8 +56,8 @@ import org.apache.commons.io.IOUtils;
  */
 public final class PatternTokenizer extends Tokenizer {
 
-  private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
-  private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 
   private String str;
   private int index;
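The cast removals in the last few hunks, and the same pattern repeated below, all follow from one API change: AttributeSource.addAttribute became generic in Lucene 3.0, so the returned attribute no longer needs a cast. A minimal, self-contained filter skeleton showing the idiom; the class name and pass-through behavior are illustrative only:

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    final class PassThroughFilter extends TokenFilter {
      // No cast needed: addAttribute(Class<A>) now returns A directly.
      private final TermAttribute termAtt = addAttribute(TermAttribute.class);

      PassThroughFilter(TokenStream input) {
        super(input);
      }

      @Override
      public boolean incrementToken() throws IOException {
        // termAtt reflects the current token after each incrementToken() call.
        return input.incrementToken();
      }
    }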
@@ -18,12 +18,11 @@
 
 
 package org.apache.solr.analysis;
-import org.apache.lucene.analysis.fa.*;
-import java.io.IOException;
-import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import java.util.Map;
+
+/** Factory for {@link PersianNormalizationFilter} */
 public class PersianNormalizationFilterFactory extends BaseTokenFilterFactory {
   public PersianNormalizationFilter create(TokenStream input) {
     return new PersianNormalizationFilter(input);
@@ -20,7 +20,6 @@ package org.apache.solr.analysis;
 import org.apache.commons.codec.Encoder;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 
@@ -47,8 +46,8 @@ public class PhoneticFilter extends TokenFilter
     this.encoder = encoder;
     this.name = name;
     this.inject = inject;
-    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
-    this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+    this.termAtt = addAttribute(TermAttribute.class);
+    this.posAtt = addAttribute(PositionIncrementAttribute.class);
   }
 
   @Override
@@ -17,11 +17,12 @@
 
 package org.apache.solr.analysis;
 
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.solr.util.CharArrayMap;
+import org.apache.lucene.util.Version;
 
 import java.io.IOException;
 
@@ -30,12 +31,11 @@ import java.io.IOException;
  */
 public final class RemoveDuplicatesTokenFilter extends TokenFilter {
 
-  private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
-  private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+  private final TermAttribute termAttribute = addAttribute(TermAttribute.class);
+  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
 
-  // keep a seen 'set' after each term with posInc > 0
-  // for now use CharArrayMap vs CharArraySet, as it has clear()
-  private final CharArrayMap<Boolean> previous = new CharArrayMap<Boolean>(8, false);
+  // use a fixed version, as we don't care about case sensitivity.
+  private final CharArraySet previous = new CharArraySet(Version.LUCENE_31, 8, false);
 
   /**
    * Creates a new RemoveDuplicatesTokenFilter
@@ -60,12 +60,12 @@ public final class RemoveDuplicatesTokenFilter extends TokenFilter {
       previous.clear();
     }
 
-    boolean duplicate = (posIncrement == 0 && previous.get(term, 0, length) != null);
+    boolean duplicate = (posIncrement == 0 && previous.contains(term, 0, length));
 
     // clone the term, and add to the set of seen terms.
     char saved[] = new char[length];
     System.arraycopy(term, 0, saved, 0, length);
-    previous.put(saved, Boolean.TRUE);
+    previous.add(saved);
 
     if (!duplicate) {
       return true;
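The switch from the Solr-local CharArrayMap to Lucene's CharArraySet works because the set offers the same allocation-free lookups over a char[] slice, via the contains(char[], int, int) and add(char[]) calls used above. A small demonstration under the same fixed Version.LUCENE_31:

    import org.apache.lucene.analysis.CharArraySet;
    import org.apache.lucene.util.Version;

    class SeenTermsDemo {
      public static void main(String[] args) {
        // Case-sensitive set with a small initial capacity, as in the filter.
        CharArraySet seen = new CharArraySet(Version.LUCENE_31, 8, false);
        char[] termBuffer = {'f', 'o', 'o', 'x'}; // pretend the term length is 3

        System.out.println(seen.contains(termBuffer, 0, 3)); // false
        seen.add(new char[] {'f', 'o', 'o'});                // stores a copy, no String built
        System.out.println(seen.contains(termBuffer, 0, 3)); // true
      }
    }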
@@ -45,8 +45,8 @@ public class ReversedWildcardFilter extends TokenFilter {
 
   protected ReversedWildcardFilter(TokenStream input, boolean withOriginal, char markerChar) {
     super(input);
-    this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
-    this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+    this.termAtt = addAttribute(TermAttribute.class);
+    this.posAtt = addAttribute(PositionIncrementAttribute.class);
     this.withOriginal = withOriginal;
     this.markerChar = markerChar;
   }
 
@@ -1,61 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-//package org.apache.solr.analysis;
-//import org.apache.lucene.analysis.ru.*;
-//import java.util.Map;
-//import java.util.HashMap;
-//import org.apache.solr.core.SolrConfig;
-//import org.apache.solr.common.SolrException;
-//import org.apache.solr.common.SolrException.ErrorCode;
-//import org.slf4j.Logger;
-//import org.slf4j.LoggerFactory;
-//
-//@Deprecated
-//public class RussianCommon {
-//
-//  private static Logger logger = LoggerFactory.getLogger(RussianCommon.class);
-//
-//  private static Map<String,char[]> CHARSETS = new HashMap<String,char[]>();
-//  static {
-//    CHARSETS.put("UnicodeRussian",RussianCharsets.UnicodeRussian);
-//    CHARSETS.put("KOI8",RussianCharsets.KOI8);
-//    CHARSETS.put("CP1251",RussianCharsets.CP1251);
-//  }
-//
-//  public static char[] getCharset(String name) {
-//    if (null == name)
-//      return RussianCharsets.UnicodeRussian;
-//
-//    char[] charset = CHARSETS.get(name);
-//
-//    if (charset.equals(RussianCharsets.UnicodeRussian))
-//      logger.warn("Specifying UnicodeRussian is no longer required (default). "
-//        + "Use of the charset parameter will cause an error in Solr 1.5");
-//    else
-//      logger.warn("Support for this custom encoding is deprecated. "
-//        + "Use of the charset parameter will cause an error in Solr 1.5");
-//
-//    if (null == charset) {
-//      throw new SolrException(ErrorCode.SERVER_ERROR,
-//        "Don't understand charset: " + name);
-//    }
-//    return charset;
-//  }
-//}
@@ -24,6 +24,10 @@ import org.apache.lucene.analysis.ru.RussianLetterTokenizer;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
 
+/** @deprecated Use {@link StandardTokenizerFactory} instead.
+ *  This tokenizer has no Russian-specific functionality.
+ */
+@Deprecated
 public class RussianLetterTokenizerFactory extends BaseTokenizerFactory {
 
   @Override
@@ -19,11 +19,17 @@ package org.apache.solr.analysis;
 
 import java.util.Map;
 
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.ru.RussianLowerCaseFilter;
+import org.apache.lucene.util.Version;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
 
+/** @deprecated Use {@link LowerCaseFilterFactory} instead which has the
+ *  same functionality.
+ */
+@Deprecated
 public class RussianLowerCaseFilterFactory extends BaseTokenFilterFactory {
 
   @Override
@@ -35,8 +41,9 @@ public class RussianLowerCaseFilterFactory extends BaseTokenFilterFactory {
         + "Please process your documents as Unicode instead.");
   }
 
-  public RussianLowerCaseFilter create(TokenStream in) {
-    return new RussianLowerCaseFilter(in);
+  public TokenFilter create(TokenStream in) {
+    // hardcode the version to give exactly the old behavior
+    return new LowerCaseFilter(Version.LUCENE_29, in);
   }
 }
@@ -19,16 +19,19 @@
 
 package org.apache.solr.analysis;
 
-import java.util.Map;
+import org.apache.lucene.analysis.TokenFilter;
 
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.ru.RussianStemFilter;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
 
+/**
+ * @deprecated Use {@link SnowballPorterFilterFactory} with "Russian" instead,
+ *  which has the same functionality.
+ */
+@Deprecated
 public class RussianStemFilterFactory extends BaseTokenFilterFactory {
 
-  public RussianStemFilter create(TokenStream in) {
-    return new RussianStemFilter(in);
+  public TokenFilter create(TokenStream in) {
+    return new SnowballFilter(in, new org.tartarus.snowball.ext.RussianStemmer());
   }
 }
@@ -18,14 +18,12 @@
 
 
 package org.apache.solr.analysis;
-import org.apache.lucene.analysis.shingle.*;
-import java.io.IOException;
-import java.util.LinkedList;
-import java.util.Iterator;
-import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
 import java.util.Map;
 
+/** Factory for {@link ShingleFilter} */
 public class ShingleFilterFactory extends BaseTokenFilterFactory {
   private int maxShingleSize;
   private boolean outputUnigrams;
@@ -17,26 +17,21 @@
 package org.apache.solr.analysis;
 
 import java.util.Map;
-import java.util.List;
-import java.io.File;
 import java.io.IOException;
 
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.solr.common.ResourceLoader;
-import org.apache.solr.common.util.StrUtils;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
 import org.tartarus.snowball.SnowballProgram;
 
 /**
- * Factory for SnowballFilters, with configurable language
- *
- * Browsing the code, SnowballFilter uses reflection to adapt to Lucene... don't
- * use this if you are concerned about speed. Use EnglishPorterFilterFactory.
+ * Factory for {@link SnowballFilter}, with configurable language
+ * <p>
+ * Note: Use of the "Lovins" stemmer is not recommended, as it is implemented with reflection.
  *
  * @version $Id$
  */
@@ -44,28 +39,14 @@ public class SnowballPorterFilterFactory extends BaseTokenFilterFactory implemen
   public static final String PROTECTED_TOKENS = "protected";
 
   private String language = "English";
-  private Class stemClass;
+  private Class<?> stemClass;
 
 
   public void inform(ResourceLoader loader) {
     String wordFiles = args.get(PROTECTED_TOKENS);
     if (wordFiles != null) {
       try {
-        File protectedWordFiles = new File(wordFiles);
-        if (protectedWordFiles.exists()) {
-          List<String> wlist = loader.getLines(wordFiles);
-          //This cast is safe in Lucene
-          protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
-        } else {
-          List<String> files = StrUtils.splitFileNames(wordFiles);
-          for (String file : files) {
-            List<String> wlist = loader.getLines(file.trim());
-            if (protectedWords == null)
-              protectedWords = new CharArraySet(wlist, false);
-            else
-              protectedWords.addAll(wlist);
-          }
-        }
+        protectedWords = getWordSet(loader, wordFiles, false);
       } catch (IOException e) {
         throw new RuntimeException(e);
       }
@@ -87,50 +68,17 @@ public class SnowballPorterFilterFactory extends BaseTokenFilterFactory implemen
     }
   }
 
-  public SnowballPorterFilter create(TokenStream input) {
+  public TokenFilter create(TokenStream input) {
     SnowballProgram program;
     try {
       program = (SnowballProgram)stemClass.newInstance();
     } catch (Exception e) {
       throw new RuntimeException("Error instantiating stemmer for language " + language + "from class " +stemClass, e);
     }
-    return new SnowballPorterFilter(input, program, protectedWords);
+
+    if (protectedWords != null)
+      input = new KeywordMarkerTokenFilter(input, protectedWords);
+    return new SnowballFilter(input, program);
   }
 }
-
-
-class SnowballPorterFilter extends TokenFilter {
-  private final CharArraySet protWords;
-  private final SnowballProgram stemmer;
-  private final TermAttribute termAtt;
-
-  public SnowballPorterFilter(TokenStream source, SnowballProgram stemmer, CharArraySet protWords) {
-    super(source);
-    this.protWords = protWords;
-    this.stemmer = stemmer;
-    this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
-  }
-
-  @Override
-  public boolean incrementToken() throws IOException {
-    if (!input.incrementToken()) return false;
-
-    char[] termBuffer = termAtt.termBuffer();
-    int len = termAtt.termLength();
-    // if protected, don't stem. use this to avoid stemming collisions.
-    if (protWords != null && protWords.contains(termBuffer, 0, len)) {
-      return true;
-    }
-
-    stemmer.setCurrent(termBuffer, len);
-    stemmer.stem();
-    final char finalTerm[] = stemmer.getCurrentBuffer();
-    final int newLength = stemmer.getCurrentBufferLength();
-    if (finalTerm != termBuffer)
-      termAtt.setTermBuffer(finalTerm, 0, newLength);
-    else
-      termAtt.setTermLength(newLength);
-
-    return true;
-  }
-}
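The factory still instantiates the Snowball stemmer reflectively via stemClass.newInstance(); only the surrounding filter changed (the hand-written SnowballPorterFilter is replaced by KeywordMarkerTokenFilter plus the stock SnowballFilter). A sketch of the lookup that pairs with create() above, assuming the usual org.tartarus.snowball.ext.<Language>Stemmer naming convention; that convention is inferred from the RussianStemmer usage earlier in this commit, not spelled out in this hunk:

    import org.tartarus.snowball.SnowballProgram;

    class StemmerLookupSketch {
      static SnowballProgram forLanguage(String language) {
        try {
          // e.g. language "English" -> org.tartarus.snowball.ext.EnglishStemmer
          Class<?> stemClass =
              Class.forName("org.tartarus.snowball.ext." + language + "Stemmer");
          return (SnowballProgram) stemClass.newInstance();
        } catch (Exception e) {
          throw new RuntimeException("Error instantiating stemmer for language " + language, e);
        }
      }
    }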
@@ -17,18 +17,23 @@
 
 package org.apache.solr.analysis;
 
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 
 import java.io.Reader;
+import java.util.Map;
 
 /**
  * @version $Id$
  */
 
 public class StandardTokenizerFactory extends BaseTokenizerFactory {
-  public StandardTokenizer create(Reader input) {
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
     assureMatchVersion();
+  }
+
+  public StandardTokenizer create(Reader input) {
     return new StandardTokenizer(luceneMatchVersion, input);
   }
 }
@@ -0,0 +1,68 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.lucene.analysis.CharArrayMap;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+/**
+ * Factory for {@link StemmerOverrideFilter}
+ */
+public class StemmerOverrideFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+  private CharArrayMap<String> dictionary = null;
+  private boolean ignoreCase;
+
+  public void inform(ResourceLoader loader) {
+    String dictionaryFiles = args.get("dictionary");
+    ignoreCase = getBoolean("ignoreCase", false);
+    if (dictionaryFiles != null) {
+      assureMatchVersion();
+      List<String> files = StrUtils.splitFileNames(dictionaryFiles);
+      try {
+        if (files.size() > 0) {
+          dictionary = new CharArrayMap<String>(luceneMatchVersion,
+              files.size() * 10, ignoreCase);
+          for (String file : files) {
+            List<String> list = loader.getLines(file.trim());
+            for (String line : list) {
+              String[] mapping = line.split("\t", 2);
+              dictionary.put(mapping[0], mapping[1]);
+            }
+          }
+        }
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    }
+  }
+
+  public boolean isIgnoreCase() {
+    return ignoreCase;
+  }
+
+  public TokenStream create(TokenStream input) {
+    return dictionary == null ? input : new StemmerOverrideFilter(luceneMatchVersion, input, dictionary);
+  }
+}
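From the parsing in inform() above, a dictionary file carries one mapping per line: the raw token and the stem it should be rewritten to, separated by a single tab. The same kind of map can be built in code; the entries below are illustrative:

    import org.apache.lucene.analysis.CharArrayMap;
    import org.apache.lucene.util.Version;

    class OverrideDictionarySketch {
      static CharArrayMap<String> build(Version matchVersion) {
        // Mirrors inform(): a case-sensitive map from token to forced stem.
        CharArrayMap<String> dict = new CharArrayMap<String>(matchVersion, 4, false);
        dict.put("running", "run");   // same as a file line: running<TAB>run
        dict.put("indices", "index"); // same as a file line: indices<TAB>index
        return dict;
      }
    }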
@@ -18,18 +18,14 @@
 package org.apache.solr.analysis;
 
 import org.apache.solr.common.ResourceLoader;
-import org.apache.solr.common.util.StrUtils;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.StopAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.CharArraySet;
 
-import java.util.HashSet;
-import java.util.List;
-import java.io.File;
+import java.util.Map;
 import java.util.Set;
-import java.io.File;
 import java.io.IOException;
 
 /**
@@ -37,6 +33,12 @@ import java.io.IOException;
  */
 public class StopFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
 
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
+    assureMatchVersion();
+  }
+
   public void inform(ResourceLoader loader) {
     String stopWordFiles = args.get("words");
     ignoreCase = getBoolean("ignoreCase",false);
@@ -44,20 +46,12 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc
 
     if (stopWordFiles != null) {
       try {
-        List<String> files = StrUtils.splitFileNames(stopWordFiles);
-        if (stopWords == null && files.size() > 0){
-          //default stopwords list has 35 or so words, but maybe don't make it that big to start
-          stopWords = new CharArraySet(files.size() * 10, ignoreCase);
-        }
-        for (String file : files) {
-          List<String> wlist = loader.getLines(file.trim());
-          stopWords.addAll(StopFilter.makeStopSet(wlist, ignoreCase));
-        }
+        stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
       } catch (IOException e) {
         throw new RuntimeException(e);
       }
     } else {
-      stopWords = new CharArraySet(StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
+      stopWords = new CharArraySet(luceneMatchVersion, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
     }
   }
 
@@ -78,7 +72,6 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc
   }
 
   public StopFilter create(TokenStream input) {
-    assureMatchVersion();
     StopFilter stopFilter = new StopFilter(luceneMatchVersion,input,stopWords,ignoreCase);
     stopFilter.setEnablePositionIncrements(enablePositionIncrements);
     return stopFilter;
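Putting those pieces together, a stop-filter chain built the way the factory now builds it, with the version-aware copy of the default English stop set and position increments enabled; the tokenizer and input text are arbitrary:

    import java.io.StringReader;

    import org.apache.lucene.analysis.CharArraySet;
    import org.apache.lucene.analysis.StopAnalyzer;
    import org.apache.lucene.analysis.StopFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.util.Version;

    class StopChainSketch {
      static TokenStream build() {
        Version v = Version.LUCENE_31;
        // Version-aware copy of the default English stop set, as in inform().
        CharArraySet stops = new CharArraySet(v, StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
        TokenStream ts = new WhitespaceTokenizer(v, new StringReader("to be or not to be"));
        StopFilter stop = new StopFilter(v, ts, stops, true);
        stop.setEnablePositionIncrements(true);
        return stop;
      }
    }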
@@ -24,7 +24,6 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.util.AttributeSource;
 
 import java.io.IOException;
@@ -50,7 +49,7 @@ public class SynonymFilter extends TokenFilter {
   public SynonymFilter(TokenStream in, SynonymMap map) {
     super(in);
     this.map = map;
-    // just ensuring these exist attributes exist...
+    // just ensuring these attributes exist...
     addAttribute(TermAttribute.class);
     addAttribute(PositionIncrementAttribute.class);
     addAttribute(OffsetAttribute.class);
@@ -88,7 +87,7 @@ public class SynonymFilter extends TokenFilter {
     // common case fast-path of first token not matching anything
     AttributeSource firstTok = nextTok();
     if (firstTok == null) return false;
-    TermAttribute termAtt = (TermAttribute) firstTok.addAttribute(TermAttribute.class);
+    TermAttribute termAtt = firstTok.addAttribute(TermAttribute.class);
     SynonymMap result = map.submap!=null ? map.submap.get(termAtt.termBuffer(), 0, termAtt.termLength()) : null;
     if (result == null) {
       copy(this, firstTok);
@@ -121,7 +120,7 @@ public class SynonymFilter extends TokenFilter {
     boolean includeOrig = result.includeOrig();
 
     AttributeSource origTok = includeOrig ? firstTok : null;
-    PositionIncrementAttribute firstPosIncAtt = (PositionIncrementAttribute) firstTok.addAttribute(PositionIncrementAttribute.class);
+    PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
     int origPos = firstPosIncAtt.getPositionIncrement();  // position of origTok in the original stream
     int repPos=0; // curr position in replacement token stream
     int pos=0;  // current position in merged token stream
@@ -129,12 +128,11 @@ public class SynonymFilter extends TokenFilter {
       for (int i=0; i<result.synonyms.length; i++) {
         Token repTok = result.synonyms[i];
         AttributeSource newTok = firstTok.cloneAttributes();
-        TermAttribute newTermAtt = (TermAttribute) newTok.addAttribute(TermAttribute.class);
-        OffsetAttribute newOffsetAtt = (OffsetAttribute) newTok.addAttribute(OffsetAttribute.class);
-        TypeAttribute newTypeAtt = (TypeAttribute) newTok.addAttribute(TypeAttribute.class);
-        PositionIncrementAttribute newPosIncAtt = (PositionIncrementAttribute) newTok.addAttribute(PositionIncrementAttribute.class);
+        TermAttribute newTermAtt = newTok.addAttribute(TermAttribute.class);
+        OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
+        PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);
 
-        OffsetAttribute lastOffsetAtt = (OffsetAttribute) lastTok.addAttribute(OffsetAttribute.class);
+        OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);
 
         newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
         newTermAtt.setTermBuffer(repTok.termBuffer(), 0, repTok.termLength());
@@ -143,13 +141,13 @@ public class SynonymFilter extends TokenFilter {
 
         // if necessary, insert original tokens and adjust position increment
         while (origTok != null && origPos <= repPos) {
-          PositionIncrementAttribute origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
+          PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
           origPosInc.setPositionIncrement(origPos-pos);
           generated.add(origTok);
           pos += origPosInc.getPositionIncrement();
           origTok = matched.isEmpty() ? null : matched.removeFirst();
           if (origTok != null) {
-            origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
+            origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
             origPos += origPosInc.getPositionIncrement();
           }
         }
@@ -161,13 +159,13 @@ public class SynonymFilter extends TokenFilter {
 
       // finish up any leftover original tokens
       while (origTok!=null) {
-        PositionIncrementAttribute origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
+        PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
         origPosInc.setPositionIncrement(origPos-pos);
         generated.add(origTok);
         pos += origPosInc.getPositionIncrement();
         origTok = matched.isEmpty() ? null : matched.removeFirst();
         if (origTok != null) {
-          origPosInc = (PositionIncrementAttribute) origTok.addAttribute(PositionIncrementAttribute.class);
+          origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
           origPos += origPosInc.getPositionIncrement();
         }
       }
@@ -217,7 +215,7 @@ public class SynonymFilter extends TokenFilter {
       if (tok == this)
        tok = cloneAttributes();
       // check for positionIncrement!=1?  if>1, should not match, if==0, check multiple at this level?
-      TermAttribute termAtt = (TermAttribute) tok.getAttribute(TermAttribute.class);
+      TermAttribute termAtt = tok.getAttribute(TermAttribute.class);
       SynonymMap subMap = map.submap.get(termAtt.termBuffer(), 0, termAtt.termLength());
 
       if (subMap != null) {
@@ -243,12 +241,8 @@ public class SynonymFilter extends TokenFilter {
   }
 
   private void copy(AttributeSource target, AttributeSource source) {
-    if (target == source)
-      return;
-    for (Iterator<AttributeImpl> sourceIt = source.getAttributeImplsIterator(), targetIt=target.getAttributeImplsIterator();
-         sourceIt.hasNext();) {
-      sourceIt.next().copyTo(targetIt.next());
-    }
+    if (target != source)
+      source.copyTo(target);
   }
 
   @Override
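The rewritten copy() leans on AttributeSource.copyTo, which copies the state of every attribute implementation in one call instead of walking paired iterators. A small round trip showing the same primitives the synonym filter uses (cloneAttributes for a buffered snapshot, copyTo to write state back), independent of the synonym machinery:

    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.AttributeSource;

    class AttributeCopyDemo {
      public static void main(String[] args) {
        AttributeSource src = new AttributeSource();
        TermAttribute term = src.addAttribute(TermAttribute.class);
        term.setTermBuffer("hello");

        AttributeSource snapshot = src.cloneAttributes(); // buffered copy
        term.setTermBuffer("world");
        snapshot.copyTo(src);                             // restore the snapshot
        System.out.println(term.term());                  // prints "hello"
      }
    }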
@@ -17,7 +17,6 @@
 
 package org.apache.solr.analysis;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.solr.common.ResourceLoader;
@@ -136,7 +135,7 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso
     TokenStream ts = loadTokenizer(tokFactory, reader);
     List<String> tokList = new ArrayList<String>();
     try {
-      TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+      TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
       while (ts.incrementToken()){
         String text = new String(termAtt.termBuffer(), 0, termAtt.termLength());
         if( text.length() > 0 )
@@ -17,8 +17,9 @@
 
 package org.apache.solr.analysis;
 
+import org.apache.lucene.analysis.CharArrayMap;
 import org.apache.lucene.analysis.Token;
-import org.apache.solr.util.CharArrayMap;
+import org.apache.lucene.util.Version;
 
 import java.util.*;
 
@@ -52,7 +53,9 @@ public class SynonymMap {
     SynonymMap currMap = this;
     for (String str : singleMatch) {
       if (currMap.submap==null) {
-        currMap.submap = new CharArrayMap<SynonymMap>(1, ignoreCase());
+        // for now hardcode at 2.9, as its what the old code did.
+        // would be nice to fix, but shouldn't store a version in each submap!!!
+        currMap.submap = new CharArrayMap<SynonymMap>(Version.LUCENE_29, 1, ignoreCase());
       }
 
       SynonymMap map = currMap.submap.get(str);
@@ -68,7 +71,7 @@ public class SynonymMap {
     if (currMap.synonyms != null && !mergeExisting) {
       throw new RuntimeException("SynonymFilter: there is already a mapping for " + singleMatch);
     }
-    List superset = currMap.synonyms==null ? replacement :
+    List<Token> superset = currMap.synonyms==null ? replacement :
       mergeTokens(Arrays.asList(currMap.synonyms), replacement);
     currMap.synonyms = (Token[])superset.toArray(new Token[superset.size()]);
     if (includeOrig) currMap.flags |= INCLUDE_ORIG;
@@ -18,15 +18,11 @@
 
 
 package org.apache.solr.analysis;
-import org.apache.lucene.analysis.th.*;
-import java.io.IOException;
-import java.util.Locale;
-import java.lang.Character.UnicodeBlock;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.th.ThaiWordFilter;
 import org.apache.lucene.analysis.TokenStream;
-import java.text.BreakIterator;
-import java.util.Map;
+
+/** Factory for {@link ThaiWordFilter} */
 public class ThaiWordFilterFactory extends BaseTokenFilterFactory {
   public ThaiWordFilter create(TokenStream input) {
     return new ThaiWordFilter(input);
@@ -18,13 +18,11 @@
 
 
 package org.apache.solr.analysis;
-import org.apache.lucene.analysis.payloads.*;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.payloads.TokenOffsetPayloadTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.index.Payload;
-import java.io.IOException;
-import java.util.Map;
+
+/** Factory for {@link TokenOffsetPayloadTokenFilter} */
 public class TokenOffsetPayloadTokenFilterFactory extends BaseTokenFilterFactory {
   public TokenOffsetPayloadTokenFilter create(TokenStream input) {
     return new TokenOffsetPayloadTokenFilter(input);
@@ -23,7 +23,6 @@ import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.Tokenizer;
 
 import java.io.Reader;
-import java.io.IOException;
 
 /**
  * @version $Id$
@@ -19,7 +19,6 @@ package org.apache.solr.analysis;
 
 import java.io.*;
 import java.util.Map;
-import org.apache.solr.core.SolrConfig;
 import org.apache.lucene.analysis.*;
 
 
@@ -16,7 +16,6 @@
  */
 package org.apache.solr.analysis;
 
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.NumericTokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.solr.common.SolrException;
@@ -17,7 +17,6 @@
 
 package org.apache.solr.analysis;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@@ -41,8 +40,8 @@ public final class TrimFilter extends TokenFilter {
     super(in);
     this.updateOffsets = updateOffsets;
 
-    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
-    this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+    this.termAtt = addAttribute(TermAttribute.class);
+    this.offsetAtt = addAttribute(OffsetAttribute.class);
   }
 
   @Override
@@ -20,7 +20,7 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;
 
-/** Factory for TurkishLowerCaseFilter */
+/** Factory for {@link TurkishLowerCaseFilter} */
 public class TurkishLowerCaseFilterFactory extends BaseTokenFilterFactory {
   public TokenStream create(TokenStream input) {
     return new TurkishLowerCaseFilter(input);
@@ -18,13 +18,11 @@
 
 
 package org.apache.solr.analysis;
-import org.apache.lucene.analysis.payloads.*;
-import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.index.Payload;
-import java.io.IOException;
-import java.util.Map;
+
+/** Factory for {@link TypeAsPayloadTokenFilter} */
 public class TypeAsPayloadTokenFilterFactory extends BaseTokenFilterFactory {
   public TypeAsPayloadTokenFilter create(TokenStream input) {
     return new TypeAsPayloadTokenFilter(input);
@@ -17,17 +17,22 @@
 
 package org.apache.solr.analysis;
 
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 
 import java.io.Reader;
+import java.util.Map;
 
 /**
  * @version $Id$
 */
 public class WhitespaceTokenizerFactory extends BaseTokenizerFactory {
-  public WhitespaceTokenizer create(Reader input) {
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
     assureMatchVersion();
+  }
+
+  public WhitespaceTokenizer create(Reader input) {
     return new WhitespaceTokenizer(luceneMatchVersion,input);
   }
 }
@@ -120,10 +120,10 @@ final class WordDelimiterFilter extends TokenFilter {
    */
   final CharArraySet protWords;
 
-  private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
-  private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
-  private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
-  private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class);
+  private final TermAttribute termAttribute = addAttribute(TermAttribute.class);
+  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
+  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
 
   // used for iterating word delimiter breaks
   private final WordDelimiterIterator iterator;
@@ -21,12 +21,8 @@ import org.apache.lucene.analysis.CharArraySet;

 import org.apache.solr.util.plugin.ResourceLoaderAware;
 import org.apache.solr.common.ResourceLoader;
-import org.apache.solr.common.util.StrUtils;


 import java.util.Map;
-import java.io.File;
-import java.util.List;
 import java.io.IOException;

@@ -40,21 +36,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implement
     String wordFiles = args.get(PROTECTED_TOKENS);
     if (wordFiles != null) {
       try {
-        File protectedWordFiles = new File(wordFiles);
-        if (protectedWordFiles.exists()) {
-          List<String> wlist = loader.getLines(wordFiles);
-          //This cast is safe in Lucene
-          protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
-        } else {
-          List<String> files = StrUtils.splitFileNames(wordFiles);
-          for (String file : files) {
-            List<String> wlist = loader.getLines(file.trim());
-            if (protectedWords == null)
-              protectedWords = new CharArraySet(wlist, false);
-            else
-              protectedWords.addAll(wlist);
-          }
-        }
+        protectedWords = getWordSet(loader, wordFiles, false);
       } catch (IOException e) {
         throw new RuntimeException(e);
       }
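The inline loading logic collapses into a single getWordSet(loader, wordFiles, false) call. As a hedged sketch (the real helper lives in the shared factory base class and may differ in detail), the consolidated behavior is roughly what the removed else-branch did, applied uniformly:

    // Hypothetical equivalent of getWordSet -- split the comma-separated file
    // list, read each file through the ResourceLoader, pool the lines:
    static CharArraySet loadWordSet(ResourceLoader loader, String wordFiles,
                                    boolean ignoreCase) throws IOException {
      List<String> files = StrUtils.splitFileNames(wordFiles);
      CharArraySet words = new CharArraySet(files.size() * 16, ignoreCase);
      for (String file : files) {
        words.addAll(loader.getLines(file.trim())); // one entry per line
      }
      return words;
    }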
@@ -1,411 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.solr.util;
-
-import java.util.*;
-import java.io.Serializable;
-
-/**
- * A simple class that stores key Strings as char[]'s in a
- * hash table.  Note that this is not a general purpose
- * class.  For example, it cannot remove items from the
- * map, nor does it resize its hash table to be smaller,
- * etc.  It is designed to be quick to retrieve items
- * by char[] keys without the necessity of converting
- * to a String first.
- */
-
-public class CharArrayMap<V> extends AbstractMap<String, V>
-  implements Map<String, V>, Cloneable, Serializable
-{
-  private final static int INIT_SIZE = 2;
-  private char[][] keys;
-  private Object[] values;
-  private int count;
-  private final boolean ignoreCase;
-
-  /** Create map with enough capacity to hold startSize
-   *  terms */
-  public CharArrayMap(int initialCapacity, boolean ignoreCase) {
-    this.ignoreCase = ignoreCase;
-    int size = INIT_SIZE;
-    // load factor of .75, inverse is 1.25, or x+x/4
-    initialCapacity = initialCapacity + (initialCapacity >>2);
-    while(size <= initialCapacity)
-      size <<= 1;
-    keys = new char[size][];
-    values = new Object[size];
-  }
-
-  public boolean ignoreCase() {
-    return ignoreCase;
-  }
-
-  public V get(char[] key) {
-    return get(key, 0, key.length);
-  }
-
-  public V get(char[] key, int off, int len) {
-    return (V)values[getSlot(key, off, len)];
-  }
-
-  public V get(CharSequence key) {
-    return (V)values[getSlot(key)];
-  }
-
-  @Override
-  public V get(Object key) {
-    return (V)values[getSlot(key)];
-  }
-
-  @Override
-  public boolean containsKey(Object s) {
-    return keys[getSlot(s)] != null;
-  }
-
-  @Override
-  public boolean containsValue(Object value) {
-    if (value == null) {
-      // search for key with a null value
-      for (int i=0; i<keys.length; i++) {
-        if (keys[i] != null && values[i] == null) return true;
-      }
-      return false;
-    }
-
-    for (int i=0; i<values.length; i++) {
-      Object val = values[i];
-      if (val != null && value.equals(val)) return true;
-    }
-    return false;
-  }
-
-
-  private int getSlot(Object key) {
-    if (key instanceof char[]) {
-      char[] keyc = (char[])key;
-      return getSlot(keyc, 0, keyc.length);
-    }
-    return getSlot((CharSequence)key);
-  }
-
-  private int getSlot(char[] key, int off, int len) {
-    int code = getHashCode(key, len);
-    int pos = code & (keys.length-1);
-    char[] key2 = keys[pos];
-    if (key2 != null && !equals(key, off, len, key2)) {
-      final int inc = ((code>>8)+code)|1;
-      do {
-        code += inc;
-        pos = code & (keys.length-1);
-        key2 = keys[pos];
-      } while (key2 != null && !equals(key, off, len, key2));
-    }
-    return pos;
-  }
-
-  /** Returns true if the String is in the set */
-  private int getSlot(CharSequence key) {
-    int code = getHashCode(key);
-    int pos = code & (keys.length-1);
-    char[] key2 = keys[pos];
-    if (key2 != null && !equals(key, key2)) {
-      final int inc = ((code>>8)+code)|1;
-      do {
-        code += inc;
-        pos = code & (keys.length-1);
-        key2 = keys[pos];
-      } while (key2 != null && !equals(key, key2));
-    }
-    return pos;
-  }
-
-  public V put(CharSequence key, V val) {
-    return put(key.toString(), val); // could be more efficient
-  }
-
-  @Override
-  public V put(String key, V val) {
-    return put(key.toCharArray(), val);
-  }
-
-  /** Add this key,val pair to the map.
-   * The char[] key is directly used, no copy is made.
-   * If ignoreCase is true for this Map, the key array will be directly modified.
-   * The user should never modify the key after calling this method.
-   */
-  public V put(char[] key, Object val) {
-    if (ignoreCase)
-      for(int i=0;i< key.length;i++)
-        key[i] = Character.toLowerCase(key[i]);
-    int slot = getSlot(key, 0, key.length);
-    if (keys[slot] == null) count++;
-    Object prev = values[slot];
-    keys[slot] = key;
-    values[slot] = val;
-
-    if (count + (count>>2) >= keys.length) {
-      rehash();
-    }
-
-    return (V)prev;
-  }
-
-
-  private boolean equals(char[] text1, int off, int len, char[] text2) {
-    if (len != text2.length)
-      return false;
-    if (ignoreCase) {
-      for(int i=0;i<len;i++) {
-        if (Character.toLowerCase(text1[off+i]) != text2[i])
-          return false;
-      }
-    } else {
-      for(int i=0;i<len;i++) {
-        if (text1[off+i] != text2[i])
-          return false;
-      }
-    }
-    return true;
-  }
-
-  private boolean equals(CharSequence text1, char[] text2) {
-    int len = text1.length();
-    if (len != text2.length)
-      return false;
-    if (ignoreCase) {
-      for(int i=0;i<len;i++) {
-        if (Character.toLowerCase(text1.charAt(i)) != text2[i])
-          return false;
-      }
-    } else {
-      for(int i=0;i<len;i++) {
-        if (text1.charAt(i) != text2[i])
-          return false;
-      }
-    }
-    return true;
-  }
-
-  private void rehash() {
-    final int newSize = 2* keys.length;
-    char[][] oldEntries = keys;
-    Object[] oldValues = values;
-    keys = new char[newSize][];
-    values = new Object[newSize];
-
-    for(int i=0;i<oldEntries.length;i++) {
-      char[] key = oldEntries[i];
-      if (key != null) {
-        // todo: could be faster... no need to compare keys on collision
-        // since they are unique
-        int newSlot = getSlot(key,0,key.length);
-        keys[newSlot] = key;
-        values[newSlot] = oldValues[i];
-      }
-    }
-  }
-
-  private int getHashCode(char[] text, int len) {
-    int code = 0;
-    if (ignoreCase) {
-      for (int i=0; i<len; i++) {
-        code = code*31 + Character.toLowerCase(text[i]);
-      }
-    } else {
-      for (int i=0; i<len; i++) {
-        code = code*31 + text[i];
-      }
-    }
-    return code;
-  }
-
-  private int getHashCode(CharSequence text) {
-    int code;
-    if (ignoreCase) {
-      code = 0;
-      int len = text.length();
-      for (int i=0; i<len; i++) {
-        code = code*31 + Character.toLowerCase(text.charAt(i));
-      }
-    } else {
-      if (false && text instanceof String) {
-        code = text.hashCode();
-      } else {
-        code = 0;
-        int len = text.length();
-        for (int i=0; i<len; i++) {
-          code = code*31 + text.charAt(i);
-        }
-      }
-    }
-    return code;
-  }
-
-  @Override
-  public int size() {
-    return count;
-  }
-
-  @Override
-  public boolean isEmpty() {
-    return count==0;
-  }
-
-  @Override
-  public void clear() {
-    count = 0;
-    Arrays.fill(keys,null);
-    Arrays.fill(values,null);
-  }
-
-  @Override
-  public Set<Entry<String, V>> entrySet() {
-    return new EntrySet();
-  }
-
-  /** Returns an EntryIterator over this Map. */
-  public EntryIterator iterator() {
-    return new EntryIterator();
-  }
-
-  /** public iterator class so efficient methods are exposed to users */
-  public class EntryIterator implements Iterator<Map.Entry<String,V>> {
-    int pos=-1;
-    int lastPos;
-
-    EntryIterator() {
-      goNext();
-    }
-
-    private void goNext() {
-      lastPos = pos;
-      pos++;
-      while (pos < keys.length && keys[pos] == null) pos++;
-    }
-
-    public boolean hasNext() {
-      return pos < keys.length;
-    }
-
-    /** gets the next key... do not modify the returned char[] */
-    public char[] nextKey() {
-      goNext();
-      return keys[lastPos];
-    }
-
-    /** gets the next key as a newly created String object */
-    public String nextKeyString() {
-      return new String(nextKey());
-    }
-
-    /** returns the value associated with the last key returned */
-    public V currentValue() {
-      return (V)values[lastPos];
-    }
-
-    /** sets the value associated with the last key returned */
-    public V setValue(V value) {
-      V old = (V)values[lastPos];
-      values[lastPos] = value;
-      return old;
-    }
-
-    /** Returns an Entry<String,V> object created on the fly...
-     * use nextCharArray() + currentValue() for better efficiency. */
-    public Map.Entry<String,V> next() {
-      goNext();
-      return new MapEntry(lastPos);
-    }
-
-    public void remove() {
-      throw new UnsupportedOperationException();
-    }
-  }
-
-
-  private class MapEntry implements Map.Entry<String,V> {
-    final int pos;
-
-    MapEntry(int pos) {
-      this.pos = pos;
-    }
-
-    public char[] getCharArr() {
-      return keys[pos];
-    }
-
-    public String getKey() {
-      return new String(getCharArr());
-    }
-
-    public V getValue() {
-      return (V)values[pos];
-    }
-
-    public V setValue(V value) {
-      V old = (V)values[pos];
-      values[pos] = value;
-      return old;
-    }
-
-    public String toString() {
-      return getKey() + '=' + getValue();
-    }
-  }
-
-
-  private class EntrySet extends AbstractSet<Map.Entry<String, V>> {
-    public EntryIterator iterator() {
-      return new EntryIterator();
-    }
-    public boolean contains(Object o) {
-      if (!(o instanceof Map.Entry))
-        return false;
-      Map.Entry e = (Map.Entry)o;
-      Object key = e.getKey();
-      if (key==null) return false; // we don't support null keys
-      Object val = e.getValue();
-      Object v = get(key);
-      return v==null ? val==null : v.equals(val);
-    }
-    public boolean remove(Object o) {
-      throw new UnsupportedOperationException();
-    }
-    public int size() {
-      return count;
-    }
-    public void clear() {
-      CharArrayMap.this.clear();
-    }
-  }
-
-  @Override
-  public Object clone() {
-    CharArrayMap<V> map = null;
-    try {
-      map = (CharArrayMap<V>)super.clone();
-      map.keys = keys.clone();
-      map.values = values.clone();
-    } catch (CloneNotSupportedException e) {
-      // impossible
-    }
-    return map;
-  }
-}
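The deleted class was an open-addressed hash map with char[] keys and double hashing. The one non-obvious trick worth recording: the table length is always a power of two, and the probe increment is forced odd with |1, so it is coprime with the length and the probe sequence is guaranteed to visit every slot. A compact, compilable sketch of that lookup using the same expressions (illustrative; plain array equality instead of the case-folding comparison the original supported):

    static int findSlot(char[][] keys, char[] key, int code) {
      int pos = code & (keys.length - 1);          // length is a power of two
      final int inc = ((code >> 8) + code) | 1;    // odd => coprime with 2^k
      while (keys[pos] != null && !java.util.Arrays.equals(keys[pos], key)) {
        code += inc;
        pos = code & (keys.length - 1);
      }
      return pos;
    }

Presumably the removal is possible because an equivalent CharArrayMap now ships with Lucene itself, in line with the rest of this analysis sync.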
@@ -21,13 +21,18 @@ import java.util.Collections;
 import java.util.Map;

 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+import org.apache.solr.core.Config;

 /**
  * General token testing helper functions
  */
 public abstract class BaseTokenTestCase extends BaseTokenStreamTestCase
 {
+  /** a map containing the default test version param for easy testing */
   protected static final Map<String,String> DEFAULT_VERSION_PARAM =
     Collections.singletonMap("luceneMatchVersion", System.getProperty("tests.luceneMatchVersion", "LUCENE_CURRENT"));
+
+  /** The default test version for easy testing */
+  public static final Version DEFAULT_VERSION = Config.parseLuceneVersionString(DEFAULT_VERSION_PARAM.get("luceneMatchVersion"));
 }
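These two helpers are what every remaining test hunk threads through: DEFAULT_VERSION_PARAM seeds factory argument maps, and DEFAULT_VERSION feeds tokenizer constructors. The two idioms, exactly as the hunks below apply them:

    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);        // for factories
    Tokenizer tok = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("text")); // for tokenizers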
@@ -39,12 +39,12 @@ public class CommonGramsFilterFactoryTest extends BaseTokenTestCase {
     ResourceLoader loader = new SolrResourceLoader(null, null);
     assertTrue("loader is null and it shouldn't be", loader != null);
     CommonGramsFilterFactory factory = new CommonGramsFilterFactory();
-    Map<String, String> args = new HashMap<String, String>();
+    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     args.put("words", "stop-1.txt");
     args.put("ignoreCase", "true");
     factory.init(args);
     factory.inform(loader);
-    Set words = factory.getCommonWords();
+    Set<?> words = factory.getCommonWords();
     assertTrue("words is null and it shouldn't be", words != null);
     assertTrue("words Size: " + words.size() + " is not: " + 2,
         words.size() == 2);
@@ -71,13 +71,13 @@ public class CommonGramsFilterFactoryTest extends BaseTokenTestCase {
     ResourceLoader loader = new SolrResourceLoader(null, null);
     assertTrue("loader is null and it shouldn't be", loader != null);
     CommonGramsFilterFactory factory = new CommonGramsFilterFactory();
-    Map<String, String> args = new HashMap<String, String>();
+    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     factory.init(args);
     factory.inform(loader);
-    Set words = factory.getCommonWords();
+    Set<?> words = factory.getCommonWords();
     assertTrue("words is null and it shouldn't be", words != null);
     assertTrue(words.contains("the"));
-    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("testing the factory"));
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("testing the factory"));
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream,
         new String[] { "testing", "testing_the", "the", "the_factory", "factory" });
@@ -35,10 +35,10 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {

   public void testReset() throws Exception {
     final String input = "How the s a brown s cow d like A B thing?";
-    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);

-    TermAttribute term = (TermAttribute) cgf.addAttribute(TermAttribute.class);
+    TermAttribute term = cgf.addAttribute(TermAttribute.class);
     assertTrue(cgf.incrementToken());
     assertEquals("How", term.term());
     assertTrue(cgf.incrementToken());
@@ -56,11 +56,11 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {

   public void testQueryReset() throws Exception {
     final String input = "How the s a brown s cow d like A B thing?";
-    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
     CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);

-    TermAttribute term = (TermAttribute) wt.addAttribute(TermAttribute.class);
+    TermAttribute term = wt.addAttribute(TermAttribute.class);
     assertTrue(nsf.incrementToken());
     assertEquals("How_the", term.term());
     assertTrue(nsf.incrementToken());
@@ -88,7 +88,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
     @Override
     public TokenStream tokenStream(String field, Reader in) {
       return new CommonGramsQueryFilter(new CommonGramsFilter(
-          new WhitespaceTokenizer(in), commonWords));
+          new WhitespaceTokenizer(DEFAULT_VERSION, in), commonWords));
     }
   };
@@ -157,7 +157,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
     @Override
     public TokenStream tokenStream(String field, Reader in) {
       return new CommonGramsFilter(
-          new WhitespaceTokenizer(in), commonWords);
+          new WhitespaceTokenizer(DEFAULT_VERSION, in), commonWords);
     }
   };
@@ -243,7 +243,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
    */
   public void testCaseSensitive() throws Exception {
     final String input = "How The s a brown s cow d like A B thing?";
-    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     Set common = CommonGramsFilter.makeCommonSet(commonWords);
     TokenFilter cgf = new CommonGramsFilter(wt, common, false);
     assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s",
@@ -256,7 +256,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
    */
   public void testLastWordisStopWord() throws Exception {
     final String input = "dog the";
-    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
     TokenFilter nsf = new CommonGramsQueryFilter(cgf);
     assertTokenStreamContents(nsf, new String[] { "dog_the" });
@@ -267,7 +267,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
    */
   public void testFirstWordisStopWord() throws Exception {
     final String input = "the dog";
-    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
     TokenFilter nsf = new CommonGramsQueryFilter(cgf);
     assertTokenStreamContents(nsf, new String[] { "the_dog" });
@@ -278,7 +278,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
    */
   public void testOneWordQueryStopWord() throws Exception {
     final String input = "the";
-    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
     TokenFilter nsf = new CommonGramsQueryFilter(cgf);
     assertTokenStreamContents(nsf, new String[] { "the" });
@@ -289,7 +289,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
    */
   public void testOneWordQuery() throws Exception {
     final String input = "monster";
-    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
     TokenFilter nsf = new CommonGramsQueryFilter(cgf);
     assertTokenStreamContents(nsf, new String[] { "monster" });
@@ -300,7 +300,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
    */
   public void TestFirstAndLastStopWord() throws Exception {
     final String input = "the of";
-    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
     TokenFilter nsf = new CommonGramsQueryFilter(cgf);
     assertTokenStreamContents(nsf, new String[] { "the_of" });
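The expected-token arrays in these tests encode the CommonGrams contract: at index time the filter emits each common word both as a unigram and fused (with "_") to its neighbors, while CommonGramsQueryFilter keeps only the fused bigrams on the query side. A sketch, with commonWords standing for the test's common-word set:

    // index-time view: unigrams plus bigrams around the common word "the"
    TokenStream index = new CommonGramsFilter(
        new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("testing the factory")),
        commonWords);
    // -> testing, testing_the, the, the_factory, factory

    // query-time view: only the bigrams survive
    TokenStream query = new CommonGramsQueryFilter(new CommonGramsFilter(
        new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("testing the factory")),
        commonWords));
    // -> testing_the, the_factory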
@@ -38,12 +38,12 @@ public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase {
     ResourceLoader loader = new SolrResourceLoader(null, null);
     assertTrue("loader is null and it shouldn't be", loader != null);
     CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory();
-    Map<String, String> args = new HashMap<String, String>();
+    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     args.put("words", "stop-1.txt");
     args.put("ignoreCase", "true");
     factory.init(args);
     factory.inform(loader);
-    Set words = factory.getCommonWords();
+    Set<?> words = factory.getCommonWords();
     assertTrue("words is null and it shouldn't be", words != null);
     assertTrue("words Size: " + words.size() + " is not: " + 2,
         words.size() == 2);
@@ -70,13 +70,13 @@ public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase {
     ResourceLoader loader = new SolrResourceLoader(null, null);
     assertTrue("loader is null and it shouldn't be", loader != null);
     CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory();
-    Map<String, String> args = new HashMap<String, String>();
+    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     factory.init(args);
     factory.inform(loader);
-    Set words = factory.getCommonWords();
+    Set<?> words = factory.getCommonWords();
     assertTrue("words is null and it shouldn't be", words != null);
     assertTrue(words.contains("the"));
-    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("testing the factory"));
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("testing the factory"));
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream,
         new String[] { "testing_the", "the_factory" });
@@ -29,7 +29,7 @@ public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase {
   public void testDefaults() throws Exception {
     DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
     factory.init(new HashMap<String, String>());
-    TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
+    TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));

     TokenStream filteredStream = factory.create(inputStream);
     assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
@@ -43,7 +43,7 @@ public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase {
     parameters.put("maxCodeLength", "8");
     factory.init(parameters);

-    TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
+    TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));

     TokenStream filteredStream = factory.create(inputStream);
     assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
@@ -56,10 +56,10 @@ public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase {
   public void testReset() throws Exception {
     DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
     factory.init(new HashMap<String, String>());
-    TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
+    TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));

     TokenStream filteredStream = factory.create(inputStream);
-    TermAttribute termAtt = (TermAttribute) filteredStream.addAttribute(TermAttribute.class);
+    TermAttribute termAtt = filteredStream.addAttribute(TermAttribute.class);
     assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());

     assertTrue(filteredStream.incrementToken());
@@ -24,42 +24,42 @@ import org.apache.lucene.analysis.WhitespaceTokenizer;
 public class DoubleMetaphoneFilterTest extends BaseTokenTestCase {

   public void testSize4FalseInject() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
+    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
     assertTokenStreamContents(filter, new String[] { "ANTR" });
   }

   public void testSize4TrueInject() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
+    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
     assertTokenStreamContents(filter, new String[] { "international", "ANTR" });
   }

   public void testAlternateInjectFalse() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(new StringReader("Kuczewski"));
+    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Kuczewski"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
     assertTokenStreamContents(filter, new String[] { "KSSK", "KXFS" });
   }

   public void testSize8FalseInject() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
+    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
     assertTokenStreamContents(filter, new String[] { "ANTRNXNL" });
   }

   public void testNonConvertableStringsWithInject() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%&"));
+    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
     assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
   }

   public void testNonConvertableStringsWithoutInject() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%&"));
+    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
     assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });

     // should have something after the stream
-    stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%& hello"));
+    stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%& hello"));
     filter = new DoubleMetaphoneFilter(stream, 8, false);
     assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" });
   }
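The tests above pin down DoubleMetaphoneFilter's two constructor knobs: the maximum code length, and whether the original token is injected alongside its phonetic code (words with an alternate encoding, like Kuczewski, emit both codes). A summary sketch; tok() is a hypothetical helper producing a fresh tokenizer, since a stream can only be consumed once:

    static TokenStream tok(String s) {
      return new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(s));
    }

    // maxCodeLength=4, inject=false -> ANTR (code only, truncated to 4)
    TokenStream a = new DoubleMetaphoneFilter(tok("international"), 4, false);
    // maxCodeLength=4, inject=true  -> international, ANTR (original kept)
    TokenStream b = new DoubleMetaphoneFilter(tok("international"), 4, true);
    // maxCodeLength=8, inject=false -> ANTRNXNL (longer code allowed)
    TokenStream c = new DoubleMetaphoneFilter(tok("international"), 8, false);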
@@ -46,11 +46,11 @@ public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {
     }

     EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
-    Map<String, String> args = new HashMap<String, String>();
+    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);

     factory.init(args);
     factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
-    Tokenizer tokenizer = new WhitespaceTokenizer(
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION,
         new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, gold);
@@ -71,13 +71,13 @@ public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {
     }

     EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
-    Map<String, String> args = new HashMap<String, String>();
+    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     args.put(EnglishPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt");
     factory.init(args);
     List<String> lines = new ArrayList<String>();
     Collections.addAll(lines, "banks", "fledgling");
     factory.inform(new LinesMockSolrResourceLoader(lines));
-    Tokenizer tokenizer = new WhitespaceTokenizer(
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION,
         new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, gold);
@@ -33,7 +33,7 @@ public class LengthFilterTest extends BaseTokenTestCase {
     args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10));
     factory.init(args);
     String test = "foo foobar super-duper-trooper";
-    TokenStream stream = factory.create(new WhitespaceTokenizer(new StringReader(test)));
+    TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(test)));
     assertTokenStreamContents(stream, new String[] { "foobar" });
   }
 }
@@ -48,12 +48,12 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
     }

     SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
-    Map<String, String> args = new HashMap<String, String>();
+    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     args.put("language", "English");

     factory.init(args);
     factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
-    Tokenizer tokenizer = new WhitespaceTokenizer(
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION,
         new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, gold);
@@ -78,13 +78,13 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
     }

     EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
-    Map<String, String> args = new HashMap<String, String>();
+    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     args.put(SnowballPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt");
     factory.init(args);
     List<String> lines = new ArrayList<String>();
     Collections.addAll(lines, "banks", "fledgling");
     factory.inform(new LinesMockSolrResourceLoader(lines));
-    Tokenizer tokenizer = new WhitespaceTokenizer(
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION,
         new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, gold);
@@ -116,13 +116,13 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
   public void testProtected() throws Exception {
     SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
     ResourceLoader loader = new SolrResourceLoader(null, null);
-    Map<String,String> args = new HashMap<String,String>();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
     args.put("protected", "protwords.txt");
     args.put("language", "English");
     factory.init(args);
     factory.inform(loader);
     Reader reader = new StringReader("ridding of some stemming");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "ridding", "of", "some", "stem" });
   }
@@ -33,7 +33,7 @@ public class TestBrazilianStemFilterFactory extends BaseTokenTestCase {
    */
   public void testStemming() throws Exception {
     Reader reader = new StringReader("Brasília");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     BrazilianStemFilterFactory factory = new BrazilianStemFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "brasil" });
@@ -59,7 +59,7 @@ public class TestBufferedTokenStream extends BaseTokenTestCase {
     final String input = "How now A B brown A cow B like A B thing?";
     final String expected = "How now Q B brown A cow B like Q B thing?";
     TokenStream ts = new AB_Q_Stream
-      (new WhitespaceTokenizer(new StringReader(input)));
+      (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
     assertTokenStreamContents(ts, expected.split("\\s"));
   }

@@ -67,15 +67,15 @@ public class TestBufferedTokenStream extends BaseTokenTestCase {
     final String input = "How now A B brown A cow B like A B thing?";
     final String expected = "How now A A B brown A cow B like A A B thing?";
     TokenStream ts = new AB_AAB_Stream
-      (new WhitespaceTokenizer(new StringReader(input)));
+      (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
     assertTokenStreamContents(ts, expected.split("\\s"));
   }

   public void testReset() throws Exception {
     final String input = "How now A B brown A cow B like A B thing?";
-    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(input));
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     TokenStream ts = new AB_AAB_Stream(tokenizer);
-    TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
+    TermAttribute term = ts.addAttribute(TermAttribute.class);
     assertTrue(ts.incrementToken());
     assertEquals("How", term.term());
     assertTrue(ts.incrementToken());
@@ -33,7 +33,7 @@ public class TestBulgarianStemFilterFactory extends BaseTokenTestCase {
    */
   public void testStemming() throws Exception {
     Reader reader = new StringReader("компютри");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     BulgarianStemFilterFactory factory = new BulgarianStemFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "компютр" });
@ -34,7 +34,7 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
|
||||||
|
|
||||||
public void testCapitalization() throws Exception
|
public void testCapitalization() throws Exception
|
||||||
{
|
{
|
||||||
Map<String,String> args = new HashMap<String, String>();
|
Map<String,String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
|
||||||
args.put( CapitalizationFilterFactory.KEEP, "and the it BIG" );
|
args.put( CapitalizationFilterFactory.KEEP, "and the it BIG" );
|
||||||
args.put( CapitalizationFilterFactory.ONLY_FIRST_WORD, "true" );
|
args.put( CapitalizationFilterFactory.ONLY_FIRST_WORD, "true" );
|
||||||
|
|
||||||
|
@ -74,18 +74,18 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
|
||||||
|
|
||||||
// now each token
|
// now each token
|
||||||
factory.onlyFirstWord = false;
|
factory.onlyFirstWord = false;
|
||||||
tokenizer = new WhitespaceTokenizer(new StringReader("Hello thEre my Name is Ryan"));
|
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"));
|
||||||
stream = factory.create(tokenizer);
|
stream = factory.create(tokenizer);
|
||||||
assertTokenStreamContents(stream, new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" });
|
assertTokenStreamContents(stream, new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" });
|
||||||
|
|
||||||
// now only the long words
|
// now only the long words
|
||||||
factory.minWordLength = 3;
|
factory.minWordLength = 3;
|
||||||
tokenizer = new WhitespaceTokenizer(new StringReader("Hello thEre my Name is Ryan" ));
|
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan" ));
|
||||||
stream = factory.create(tokenizer);
|
stream = factory.create(tokenizer);
|
||||||
assertTokenStreamContents(stream, new String[] { "Hello", "There", "my", "Name", "is", "Ryan" });
|
assertTokenStreamContents(stream, new String[] { "Hello", "There", "my", "Name", "is", "Ryan" });
|
||||||
|
|
||||||
// without prefix
|
// without prefix
|
||||||
tokenizer = new WhitespaceTokenizer(new StringReader("McKinley" ));
|
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" ));
|
||||||
stream = factory.create(tokenizer);
|
stream = factory.create(tokenizer);
|
||||||
assertTokenStreamContents(stream, new String[] { "Mckinley" });
|
assertTokenStreamContents(stream, new String[] { "Mckinley" });
|
||||||
|
|
||||||
|
@ -93,14 +93,14 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
|
||||||
factory = new CapitalizationFilterFactory();
|
factory = new CapitalizationFilterFactory();
|
||||||
args.put( "okPrefix", "McK" ); // all words
|
args.put( "okPrefix", "McK" ); // all words
|
||||||
factory.init( args );
|
factory.init( args );
|
||||||
tokenizer = new WhitespaceTokenizer(new StringReader("McKinley" ));
|
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" ));
|
||||||
stream = factory.create(tokenizer);
|
stream = factory.create(tokenizer);
|
||||||
assertTokenStreamContents(stream, new String[] { "McKinley" });
|
assertTokenStreamContents(stream, new String[] { "McKinley" });
|
||||||
|
|
||||||
// now try some stuff with numbers
|
// now try some stuff with numbers
|
||||||
factory.forceFirstLetter = false;
|
factory.forceFirstLetter = false;
|
||||||
factory.onlyFirstWord = false;
|
factory.onlyFirstWord = false;
|
||||||
tokenizer = new WhitespaceTokenizer(new StringReader("1st 2nd third" ));
|
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("1st 2nd third" ));
|
||||||
stream = factory.create(tokenizer);
|
stream = factory.create(tokenizer);
|
||||||
assertTokenStreamContents(stream, new String[] { "1st", "2nd", "Third" });
|
assertTokenStreamContents(stream, new String[] { "1st", "2nd", "Third" });
|
||||||
|
|
||||||
|
@ -111,7 +111,7 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
   public void testKeepIgnoreCase() throws Exception {
-    Map<String,String> args = new HashMap<String, String>();
+    Map<String,String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     args.put( CapitalizationFilterFactory.KEEP, "kitten" );
     args.put( CapitalizationFilterFactory.KEEP_IGNORE_CASE, "true" );
     args.put( CapitalizationFilterFactory.ONLY_FIRST_WORD, "true" );
@@ -141,12 +141,12 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
    * This is very weird when combined with ONLY_FIRST_WORD!!!
    */
   public void testMinWordLength() throws Exception {
-    Map<String,String> args = new HashMap<String,String>();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
     args.put(CapitalizationFilterFactory.ONLY_FIRST_WORD, "true");
     args.put(CapitalizationFilterFactory.MIN_WORD_LENGTH, "5");
     CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
     factory.init(args);
-    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(
         "helo testing"));
     TokenStream ts = factory.create(tokenizer);
     assertTokenStreamContents(ts, new String[] {"helo", "Testing"});
@@ -157,11 +157,11 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
    * in each token (it should do nothing)
    */
   public void testMaxWordCount() throws Exception {
-    Map<String,String> args = new HashMap<String,String>();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
     args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2");
     CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
     factory.init(args);
-    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(
         "one two three four"));
     TokenStream ts = factory.create(tokenizer);
     assertTokenStreamContents(ts, new String[] {"One", "Two", "Three", "Four"});
@@ -171,7 +171,7 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
    * Test CapitalizationFilterFactory's maxWordCount option when exceeded
    */
   public void testMaxWordCount2() throws Exception {
-    Map<String,String> args = new HashMap<String,String>();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
     args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2");
     CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
     factory.init(args);
@@ -187,11 +187,11 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
    * This is weird, it is not really a max, but inclusive (look at 'is')
    */
   public void testMaxTokenLength() throws Exception {
-    Map<String,String> args = new HashMap<String,String>();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
     args.put(CapitalizationFilterFactory.MAX_TOKEN_LENGTH, "2");
     CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
     factory.init(args);
-    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(
         "this is a test"));
     TokenStream ts = factory.create(tokenizer);
     assertTokenStreamContents(ts, new String[] {"this", "is", "A", "test"});
@@ -201,12 +201,12 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
    * Test CapitalizationFilterFactory's forceFirstLetter option
    */
   public void testForceFirstLetter() throws Exception {
-    Map<String,String> args = new HashMap<String,String>();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
     args.put(CapitalizationFilterFactory.KEEP, "kitten");
     args.put(CapitalizationFilterFactory.FORCE_FIRST_LETTER, "true");
     CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
     factory.init(args);
-    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("kitten"));
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("kitten"));
     TokenStream ts = factory.create(tokenizer);
     assertTokenStreamContents(ts, new String[] {"Kitten"});
   }
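
Note: the recurring change in the hunks above is Lucene 3.x's Version-aware constructors: WhitespaceTokenizer and friends now take an org.apache.lucene.util.Version (supplied in these tests through BaseTokenTestCase's DEFAULT_VERSION constant) so that tokenization behavior can be kept stable across index upgrades. A minimal sketch of the same pattern outside the test harness; Version.LUCENE_31 and the helper class name are illustrative assumptions, not part of this commit.

    import java.io.StringReader;

    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.util.Version;

    public class VersionedTokenizerSketch {
      // The Version argument pins tokenizer semantics: pass the version the
      // index was built with, and behavior-changing fixes stay opt-in.
      public static Tokenizer newTokenizer(String text) {
        return new WhitespaceTokenizer(Version.LUCENE_31, new StringReader(text));
      }
    }
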
@@ -33,7 +33,7 @@ public class TestChineseFilterFactory extends BaseTokenTestCase {
    */
   public void testFiltering() throws Exception {
     Reader reader = new StringReader("this 1234 Is such a silly filter");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     ChineseFilterFactory factory = new ChineseFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "Is", "silly", "filter" });
@@ -177,9 +177,9 @@ public class TestCollationKeyFilterFactory extends BaseTokenTestCase {
 
   private void assertCollatesToSame(TokenStream stream1, TokenStream stream2)
       throws IOException {
-    TermAttribute term1 = (TermAttribute) stream1
+    TermAttribute term1 = stream1
         .addAttribute(TermAttribute.class);
-    TermAttribute term2 = (TermAttribute) stream2
+    TermAttribute term2 = stream2
         .addAttribute(TermAttribute.class);
     assertTrue(stream1.incrementToken());
     assertTrue(stream2.incrementToken());
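
Note: the casts removed above are redundant because the attributes API is generic: TokenStream.addAttribute(Class<A>) returns the requested attribute type directly. A short consumer sketch under that assumption (the helper class and its use of the 3.x-era TermAttribute are illustrative):

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class TypedAttributeSketch {
      // Prints each term in the stream; no (TermAttribute) cast is needed
      // since addAttribute is typed by its Class argument.
      public static void printTerms(TokenStream stream) throws IOException {
        TermAttribute term = stream.addAttribute(TermAttribute.class);
        while (stream.incrementToken()) {
          System.out.println(term.term());
        }
      }
    }
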
@@ -33,7 +33,7 @@ public class TestCzechStemFilterFactory extends BaseTokenTestCase {
    */
   public void testStemming() throws Exception {
     Reader reader = new StringReader("angličtí");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     CzechStemFilterFactory factory = new CzechStemFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "anglick" });
@@ -21,8 +21,6 @@ import java.io.StringReader;
 import java.util.HashMap;
 import java.util.Map;
 
-import junit.framework.TestCase;
-
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
@@ -32,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.core.SolrResourceLoader;
 
-public class TestDelimitedPayloadTokenFilterFactory extends TestCase {
+public class TestDelimitedPayloadTokenFilterFactory extends BaseTokenTestCase {
 
   public void testEncoder() throws Exception {
     Map<String,String> args = new HashMap<String, String>();
@@ -42,10 +40,10 @@ public class TestDelimitedPayloadTokenFilterFactory extends TestCase {
     ResourceLoader loader = new SolrResourceLoader(null, null);
     factory.inform(loader);
 
-    TokenStream input = new WhitespaceTokenizer(new StringReader("the|0.1 quick|0.1 red|0.1"));
+    TokenStream input = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("the|0.1 quick|0.1 red|0.1"));
     DelimitedPayloadTokenFilter tf = factory.create(input);
     while (tf.incrementToken()){
-      PayloadAttribute payAttr = (PayloadAttribute) tf.getAttribute(PayloadAttribute.class);
+      PayloadAttribute payAttr = tf.getAttribute(PayloadAttribute.class);
       assertTrue("payAttr is null and it shouldn't be", payAttr != null);
       byte[] payData = payAttr.getPayload().getData();
       assertTrue("payData is null and it shouldn't be", payData != null);
@@ -64,10 +62,10 @@ public class TestDelimitedPayloadTokenFilterFactory extends TestCase {
     ResourceLoader loader = new SolrResourceLoader(null, null);
     factory.inform(loader);
 
-    TokenStream input = new WhitespaceTokenizer(new StringReader("the*0.1 quick*0.1 red*0.1"));
+    TokenStream input = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("the*0.1 quick*0.1 red*0.1"));
     DelimitedPayloadTokenFilter tf = factory.create(input);
     while (tf.incrementToken()){
-      PayloadAttribute payAttr = (PayloadAttribute) tf.getAttribute(PayloadAttribute.class);
+      PayloadAttribute payAttr = tf.getAttribute(PayloadAttribute.class);
       assertTrue("payAttr is null and it shouldn't be", payAttr != null);
       byte[] payData = payAttr.getPayload().getData();
       assertTrue("payData is null and it shouldn't be", payData != null);
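
Note: the tests above only assert that each token carries non-null payload bytes. Assuming the float encoder (as the "0.1" payload suffixes suggest), the bytes can be decoded with PayloadHelper; a sketch under that assumption (the helper class name is illustrative):

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.payloads.PayloadHelper;
    import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class PayloadDumpSketch {
      // Prints term -> payload pairs, decoding each payload as a float;
      // only valid when the stream was built with the float encoder.
      public static void dump(TokenStream stream) throws IOException {
        TermAttribute term = stream.addAttribute(TermAttribute.class);
        PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);
        while (stream.incrementToken()) {
          if (payload.getPayload() != null) {
            float weight = PayloadHelper.decodeFloat(payload.getPayload().getData());
            System.out.println(term.term() + " -> " + weight);
          }
        }
      }
    }
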
@@ -37,10 +37,10 @@ public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenTestCase {
    */
   public void testDecompounding() throws Exception {
     Reader reader = new StringReader("I like to play softball");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     DictionaryCompoundWordTokenFilterFactory factory = new DictionaryCompoundWordTokenFilterFactory();
     ResourceLoader loader = new SolrResourceLoader(null, null);
-    Map<String,String> args = new HashMap<String,String>();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
     args.put("dictionary", "compoundDictionary.txt");
     factory.init(args);
     factory.inform(loader);
@@ -33,7 +33,7 @@ public class TestDutchStemFilterFactory extends BaseTokenTestCase {
    */
   public void testStemming() throws Exception {
     Reader reader = new StringReader("lichamelijkheden");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     DutchStemFilterFactory factory = new DutchStemFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "licham" });
@@ -37,7 +37,7 @@ public class TestElisionFilterFactory extends BaseTokenTestCase {
    */
   public void testElision() throws Exception {
     Reader reader = new StringReader("l'avion");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     ElisionFilterFactory factory = new ElisionFilterFactory();
     factory.init(DEFAULT_VERSION_PARAM);
     ResourceLoader loader = new SolrResourceLoader(null, null);
@@ -33,7 +33,7 @@ public class TestFrenchStemFilterFactory extends BaseTokenTestCase {
    */
   public void testStemming() throws Exception {
     Reader reader = new StringReader("habitable");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     FrenchStemFilterFactory factory = new FrenchStemFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "habit" });
@@ -33,7 +33,7 @@ public class TestGermanStemFilterFactory extends BaseTokenTestCase {
    */
   public void testStemming() throws Exception {
     Reader reader = new StringReader("Tischen");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     GermanStemFilterFactory factory = new GermanStemFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "tisch" });
@@ -33,7 +33,7 @@ public class TestGreekLowerCaseFilterFactory extends BaseTokenTestCase {
    */
   public void testStemming() throws Exception {
     Reader reader = new StringReader("Μάϊος ΜΆΪΟΣ");
-    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
     GreekLowerCaseFilterFactory factory = new GreekLowerCaseFilterFactory();
     TokenStream stream = factory.create(tokenizer);
     assertTokenStreamContents(stream, new String[] { "μαιοσ", "μαιοσ" });
@@ -29,7 +29,7 @@ public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
   public void testHyphenatedWords() throws Exception {
     String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal";
     // first test
-    TokenStream ts = new WhitespaceTokenizer(new StringReader(input));
+    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory();
     ts = factory.create(ts);
     assertTokenStreamContents(ts,
@@ -42,7 +42,7 @@ public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
   public void testHyphenAtEnd() throws Exception {
     String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecology-";
     // first test
-    TokenStream ts = new WhitespaceTokenizer(new StringReader(input));
+    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
     HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory();
     ts = factory.create(ts);
     assertTokenStreamContents(ts,
@@ -23,25 +23,22 @@ import java.util.Set;
 import java.util.Map;
 import java.util.HashMap;
 
-import junit.framework.TestCase;
-
-
 /**
  *
  *
  **/
-public class TestKeepFilterFactory extends TestCase{
+public class TestKeepFilterFactory extends BaseTokenTestCase{
 
   public void testInform() throws Exception {
     ResourceLoader loader = new SolrResourceLoader(null, null);
     assertTrue("loader is null and it shouldn't be", loader != null);
     KeepWordFilterFactory factory = new KeepWordFilterFactory();
-    Map<String, String> args = new HashMap<String, String>();
+    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     args.put("words", "keep-1.txt");
     args.put("ignoreCase", "true");
     factory.init(args);
     factory.inform(loader);
-    Set words = factory.getWords();
+    Set<?> words = factory.getWords();
     assertTrue("words is null and it shouldn't be", words != null);
     assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
 
@@ -41,7 +41,7 @@ public class TestKeepWordFilter extends BaseTokenTestCase {
     words.add( "bbb" );
 
     String input = "aaa BBB ccc ddd EEE";
-    Map<String,String> args = new HashMap<String, String>();
+    Map<String,String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     ResourceLoader loader = new SolrResourceLoader(null, null);
 
     // Test Stopwords
@@ -51,29 +51,29 @@ public class TestKeepWordFilter extends BaseTokenTestCase {
     factory.inform( loader );
     factory.setWords( words );
     assertTrue(factory.isIgnoreCase());
-    TokenStream stream = factory.create(new WhitespaceTokenizer(new StringReader(input)));
+    TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
     assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
 
     // Test Stopwords (ignoreCase via the setter instead)
     factory = new KeepWordFilterFactory();
-    args = new HashMap<String, String>();
+    args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     factory.init( args );
     factory.inform( loader );
     factory.setIgnoreCase(true);
     factory.setWords( words );
     assertTrue(factory.isIgnoreCase());
-    stream = factory.create(new WhitespaceTokenizer(new StringReader(input)));
+    stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
     assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
 
     // Now force case
     factory = new KeepWordFilterFactory();
-    args = new HashMap<String, String>();
+    args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
     args.put( "ignoreCase", "false" );
     factory.init( args );
     factory.inform( loader );
     factory.setWords( words );
     assertFalse(factory.isIgnoreCase());
-    stream = factory.create(new WhitespaceTokenizer(new StringReader(input)));
+    stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
     assertTokenStreamContents(stream, new String[] { "aaa" });
   }
 }
@@ -0,0 +1,65 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.PorterStemFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.core.SolrResourceLoader;
+
+/**
+ * Simple tests to ensure the keyword marker filter factory is working.
+ */
+public class TestKeywordMarkerFilterFactory extends BaseTokenTestCase {
+  public void testKeywords() throws IOException {
+    Reader reader = new StringReader("dogs cats");
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
+    KeywordMarkerFilterFactory factory = new KeywordMarkerFilterFactory();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
+    ResourceLoader loader = new SolrResourceLoader(null, null);
+    args.put("protected", "protwords.txt");
+    factory.init(args);
+    factory.inform(loader);
+
+    TokenStream ts = new PorterStemFilter(factory.create(tokenizer));
+    assertTokenStreamContents(ts, new String[] { "dog", "cats" });
+  }
+
+  public void testKeywordsCaseInsensitive() throws IOException {
+    Reader reader = new StringReader("dogs cats Cats");
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
+    KeywordMarkerFilterFactory factory = new KeywordMarkerFilterFactory();
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
+    ResourceLoader loader = new SolrResourceLoader(null, null);
+    args.put("protected", "protwords.txt");
+    args.put("ignoreCase", "true");
+    factory.init(args);
+    factory.inform(loader);
+
+    TokenStream ts = new PorterStemFilter(factory.create(tokenizer));
+    assertTokenStreamContents(ts, new String[] { "dog", "cats", "Cats" });
+  }
+}
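
Note: the new test above exercises KeywordMarkerFilterFactory end to end; underneath it is Lucene's KeywordMarkerFilter, which sets KeywordAttribute on tokens found in the protected-words set so that keyword-aware stemmers such as PorterStemFilter leave them untouched. A minimal sketch wiring the Lucene filter directly (the in-memory word set stands in for protwords.txt, and the class name and Version.LUCENE_31 are illustrative assumptions):

    import java.io.StringReader;
    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.Set;

    import org.apache.lucene.analysis.KeywordMarkerFilter;
    import org.apache.lucene.analysis.PorterStemFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.util.Version;

    public class KeywordMarkerSketch {
      // Protect "cats" from stemming, mirroring the protwords.txt entry
      // the test relies on: "dogs cats" analyzes to "dog", "cats".
      public static TokenStream build(String text) {
        Set<String> protectedWords = new HashSet<String>(Arrays.asList("cats"));
        TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_31, new StringReader(text));
        ts = new KeywordMarkerFilter(ts, protectedWords); // marks protected tokens
        return new PorterStemFilter(ts);                  // stemmer skips marked tokens
      }
    }
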
@@ -20,7 +20,7 @@ public class TestMultiWordSynonyms extends BaseTokenTestCase {
     SynonymMap synMap = new SynonymMap(true);
     SynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null);
 
-    SynonymFilter ts = new SynonymFilter(new WhitespaceTokenizer(new StringReader("a e")), synMap);
+    SynonymFilter ts = new SynonymFilter(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("a e")), synMap);
     // This fails because ["e","e"] is the value of the token stream
     assertTokenStreamContents(ts, new String[] { "a", "e" });
   }
Some files were not shown because too many files have changed in this diff.