From 3860c16a66bfa3db8daf9515c53a5aa142097b08 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Thu, 1 Apr 2010 02:15:27 +0000
Subject: [PATCH] SOLR-1857: cleanup and sync analysis with Lucene trunk

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@929782 13f79535-47bb-0310-9956-ffa450edef68
---
 solr/CHANGES.txt | 8 +
 .../analysis/ASCIIFoldingFilterFactory.java | 7 +-
 .../ArabicLetterTokenizerFactory.java | 4 +-
 .../ArabicNormalizationFilterFactory.java | 3 +-
 .../analysis/ArabicStemFilterFactory.java | 3 +-
 .../solr/analysis/BaseTokenStreamFactory.java | 28 +-
 .../analysis/BrazilianStemFilterFactory.java | 11 +-
 .../solr/analysis/BufferedTokenStream.java | 12 +-
 .../analysis/BulgarianStemFilterFactory.java | 2 +-
 .../solr/analysis/CJKTokenizerFactory.java | 8 +-
 .../analysis/CapitalizationFilterFactory.java | 5 +-
 .../solr/analysis/ChineseFilterFactory.java | 12 +-
 .../analysis/ChineseTokenizerFactory.java | 11 +-
 .../solr/analysis/CommonGramsFilter.java | 35 +-
 .../analysis/CommonGramsFilterFactory.java | 17 +-
 .../solr/analysis/CommonGramsQueryFilter.java | 4 +-
 .../CommonGramsQueryFilterFactory.java | 27 +-
 .../solr/analysis/CzechStemFilterFactory.java | 2 +-
 .../DelimitedPayloadTokenFilterFactory.java | 2 +-
 ...tionaryCompoundWordTokenFilterFactory.java | 14 +-
 .../solr/analysis/DoubleMetaphoneFilter.java | 6 +-
 .../solr/analysis/DutchStemFilterFactory.java | 20 +-
 .../solr/analysis/ElisionFilterFactory.java | 18 +-
 .../analysis/EnglishPorterFilterFactory.java | 41 +-
 .../analysis/FrenchStemFilterFactory.java | 19 +-
 .../analysis/GermanStemFilterFactory.java | 10 +-
 .../analysis/GreekLowerCaseFilterFactory.java | 1 +
 .../HindiNormalizationFilterFactory.java | 2 +-
 .../solr/analysis/HindiStemFilterFactory.java | 2 +-
 .../solr/analysis/HyphenatedWordsFilter.java | 4 +-
 .../HyphenatedWordsFilterFactory.java | 2 +-
 .../ISOLatin1AccentFilterFactory.java | 2 +
 .../IndicNormalizationFilterFactory.java | 2 +-
 .../solr/analysis/IndicTokenizerFactory.java | 2 +-
 .../apache/solr/analysis/KeepWordFilter.java | 6 +-
 .../solr/analysis/KeepWordFilterFactory.java | 32 +-
 .../analysis/KeywordMarkerFilterFactory.java | 55 +++
 .../analysis/KeywordTokenizerFactory.java | 1 -
 .../solr/analysis/LetterTokenizerFactory.java | 10 +-
 .../solr/analysis/LowerCaseFilterFactory.java | 9 +-
 .../analysis/LowerCaseTokenizerFactory.java | 9 +-
 .../NumericPayloadTokenFilterFactory.java | 9 +-
 .../solr/analysis/PatternReplaceFilter.java | 5 +-
 .../solr/analysis/PatternTokenizer.java | 4 +-
 .../PersianNormalizationFilterFactory.java | 9 +-
 .../apache/solr/analysis/PhoneticFilter.java | 5 +-
 .../analysis/RemoveDuplicatesTokenFilter.java | 16 +-
 .../solr/analysis/ReversedWildcardFilter.java | 4 +-
 .../apache/solr/analysis/RussianCommon.java | 61 ---
 .../RussianLetterTokenizerFactory.java | 4 +
 .../RussianLowerCaseFilterFactory.java | 13 +-
 .../analysis/RussianStemFilterFactory.java | 15 +-
 .../solr/analysis/ShingleFilterFactory.java | 10 +-
 .../analysis/SnowballPorterFilterFactory.java | 76 +---
 .../analysis/StandardTokenizerFactory.java | 9 +-
 .../StemmerOverrideFilterFactory.java | 68 +++
 .../solr/analysis/StopFilterFactory.java | 25 +-
 .../apache/solr/analysis/SynonymFilter.java | 34 +-
 .../solr/analysis/SynonymFilterFactory.java | 3 +-
 .../org/apache/solr/analysis/SynonymMap.java | 9 +-
 .../solr/analysis/ThaiWordFilterFactory.java | 12 +-
 .../TokenOffsetPayloadTokenFilterFactory.java | 10 +-
 .../apache/solr/analysis/TokenizerChain.java | 1 -
 .../solr/analysis/TokenizerFactory.java | 1 -
 .../solr/analysis/TrieTokenizerFactory.java | 1 -
 .../org/apache/solr/analysis/TrimFilter.java | 5 +-
 .../TurkishLowerCaseFilterFactory.java | 2 +-
 .../TypeAsPayloadTokenFilterFactory.java | 10 +-
 .../analysis/WhitespaceTokenizerFactory.java | 9 +-
 .../solr/analysis/WordDelimiterFilter.java | 8 +-
 .../analysis/WordDelimiterFilterFactory.java | 20 +-
 .../org/apache/solr/util/CharArrayMap.java | 411 ------------------
 .../solr/analysis/BaseTokenTestCase.java | 5 +
 .../CommonGramsFilterFactoryTest.java | 10 +-
 .../solr/analysis/CommonGramsFilterTest.java | 24 +-
 .../CommonGramsQueryFilterFactoryTest.java | 10 +-
 .../DoubleMetaphoneFilterFactoryTest.java | 8 +-
 .../analysis/DoubleMetaphoneFilterTest.java | 14 +-
 .../EnglishPorterFilterFactoryTest.java | 8 +-
 .../solr/analysis/LengthFilterTest.java | 2 +-
 .../SnowballPorterFilterFactoryTest.java | 12 +-
 .../TestBrazilianStemFilterFactory.java | 2 +-
 .../analysis/TestBufferedTokenStream.java | 8 +-
 .../TestBulgarianStemFilterFactory.java | 2 +-
 .../analysis/TestCapitalizationFilter.java | 32 +-
 .../analysis/TestChineseFilterFactory.java | 2 +-
 .../TestCollationKeyFilterFactory.java | 4 +-
 .../analysis/TestCzechStemFilterFactory.java | 2 +-
 ...estDelimitedPayloadTokenFilterFactory.java | 12 +-
 ...tionaryCompoundWordTokenFilterFactory.java | 4 +-
 .../analysis/TestDutchStemFilterFactory.java | 2 +-
 .../analysis/TestElisionFilterFactory.java | 2 +-
 .../analysis/TestFrenchStemFilterFactory.java | 2 +-
 .../analysis/TestGermanStemFilterFactory.java | 2 +-
 .../TestGreekLowerCaseFilterFactory.java | 2 +-
 .../analysis/TestHyphenatedWordsFilter.java | 4 +-
 .../solr/analysis/TestKeepFilterFactory.java | 9 +-
 .../solr/analysis/TestKeepWordFilter.java | 12 +-
 .../TestKeywordMarkerFilterFactory.java | 65 +++
 .../solr/analysis/TestMultiWordSynonyms.java | 2 +-
 .../solr/analysis/TestNGramFilters.java | 10 +-
 .../TestPatternReplaceCharFilter.java | 18 +-
 .../analysis/TestPatternReplaceFilter.java | 10 +-
 .../analysis/TestPatternTokenizerFactory.java | 2 +-
 ...TestPersianNormalizationFilterFactory.java | 2 +-
 .../solr/analysis/TestPhoneticFilter.java | 2 +-
 .../analysis/TestPorterStemFilterFactory.java | 2 +-
 .../TestRemoveDuplicatesTokenFilter.java | 7 +-
 .../TestReverseStringFilterFactory.java | 2 +-
 .../TestReversedWildcardFilterFactory.java | 4 +-
 .../solr/analysis/TestRussianFilters.java | 2 -
 .../analysis/TestShingleFilterFactory.java | 6 +-
 .../solr/analysis/TestStandardFactories.java | 4 +-
 .../TestStemmerOverrideFilterFactory.java | 66 +++
 .../solr/analysis/TestStopFilterFactory.java | 9 +-
 .../solr/analysis/TestSynonymFilter.java | 16 +-
 .../analysis/TestThaiWordFilterFactory.java | 2 +-
 .../apache/solr/analysis/TestTrimFilter.java | 12 +-
 .../TestTurkishLowerCaseFilterFactory.java | 2 +-
 .../analysis/TestWordDelimiterFilter.java | 10 +-
 .../DocumentAnalysisRequestHandlerTest.java | 8 +-
 .../FieldAnalysisRequestHandlerTest.java | 8 +-
 .../apache/solr/util/TestCharArrayMap.java | 209 ---------
 .../test/test-files/solr/conf/stemdict.txt | 22 +
 124 files changed, 771 insertions(+), 1264 deletions(-)
 create mode 100644 solr/src/java/org/apache/solr/analysis/KeywordMarkerFilterFactory.java
 delete mode 100644 solr/src/java/org/apache/solr/analysis/RussianCommon.java
 create mode 100644 solr/src/java/org/apache/solr/analysis/StemmerOverrideFilterFactory.java
 delete mode 100755 solr/src/java/org/apache/solr/util/CharArrayMap.java
 create mode 100644 solr/src/test/org/apache/solr/analysis/TestKeywordMarkerFilterFactory.java
 create mode 100644 solr/src/test/org/apache/solr/analysis/TestStemmerOverrideFilterFactory.java
 delete mode 100755 solr/src/test/org/apache/solr/util/TestCharArrayMap.java
 create mode 100644 solr/src/test/test-files/solr/conf/stemdict.txt

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 9b0bd8ef1a0..f30e87e2cd0 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -126,6 +126,14 @@ New Features
 * SOLR-1769: Solr 1.4 Replication - Repeater throwing NullPointerException
   (Jörgen Rydenius via noble)
 
+* SOLR-1857: Synced Solr analysis with Lucene 3.1. Added KeywordMarkerFilterFactory
+  and StemmerOverrideFilterFactory, which can be used to tune stemming algorithms.
+  Added factories for Bulgarian, Czech, Hindi, and Turkish analysis. Improved the
+  performance of SnowballPorterFilterFactory. (rmuir)
+
+* SOLR-1657: Converted remaining TokenStreams to the Attributes-based API. All Solr
+  TokenFilters now support custom Attributes, and some have improved performance:
+  especially WordDelimiterFilter and CommonGramsFilter. (rmuir, cmale, uschindler)
 
 Optimizations
 ----------------------
diff --git a/solr/src/java/org/apache/solr/analysis/ASCIIFoldingFilterFactory.java b/solr/src/java/org/apache/solr/analysis/ASCIIFoldingFilterFactory.java
index 20060158532..b07ba79ad01 100644
--- a/solr/src/java/org/apache/solr/analysis/ASCIIFoldingFilterFactory.java
+++ b/solr/src/java/org/apache/solr/analysis/ASCIIFoldingFilterFactory.java
@@ -18,9 +18,10 @@
 package org.apache.solr.analysis;
 
-import org.apache.lucene.analysis.*;
-import org.apache.lucene.util.ArrayUtil;
-import java.util.Map;
+import org.apache.lucene.analysis.ASCIIFoldingFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/** Factory for {@link ASCIIFoldingFilter} */
 public class ASCIIFoldingFilterFactory extends BaseTokenFilterFactory {
   public ASCIIFoldingFilter create(TokenStream input) {
     return new ASCIIFoldingFilter(input);
diff --git a/solr/src/java/org/apache/solr/analysis/ArabicLetterTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/ArabicLetterTokenizerFactory.java
index 0ee3d5ccca8..f22f2232e5e 100644
--- a/solr/src/java/org/apache/solr/analysis/ArabicLetterTokenizerFactory.java
+++ b/solr/src/java/org/apache/solr/analysis/ArabicLetterTokenizerFactory.java
@@ -16,15 +16,13 @@ package org.apache.solr.analysis;
 * limitations under the License.
*/ -import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.ar.ArabicLetterTokenizer; import java.io.Reader; /** - * - * + * Factory for {@link ArabicLetterTokenizer} **/ public class ArabicLetterTokenizerFactory extends BaseTokenizerFactory{ diff --git a/solr/src/java/org/apache/solr/analysis/ArabicNormalizationFilterFactory.java b/solr/src/java/org/apache/solr/analysis/ArabicNormalizationFilterFactory.java index 3ab4d9d1254..3c6ac90db9a 100644 --- a/solr/src/java/org/apache/solr/analysis/ArabicNormalizationFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/ArabicNormalizationFilterFactory.java @@ -21,8 +21,7 @@ import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; /** - * - * + * Factory for {@link ArabicNormalizationFilter} **/ public class ArabicNormalizationFilterFactory extends BaseTokenFilterFactory{ diff --git a/solr/src/java/org/apache/solr/analysis/ArabicStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/ArabicStemFilterFactory.java index db3bea240a4..4042bf2b348 100644 --- a/solr/src/java/org/apache/solr/analysis/ArabicStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/ArabicStemFilterFactory.java @@ -21,8 +21,7 @@ import org.apache.lucene.analysis.ar.ArabicStemFilter; /** - * - * + * Factory for {@link ArabicStemFilter} **/ public class ArabicStemFilterFactory extends BaseTokenFilterFactory{ diff --git a/solr/src/java/org/apache/solr/analysis/BaseTokenStreamFactory.java b/solr/src/java/org/apache/solr/analysis/BaseTokenStreamFactory.java index 21e9e4f5f9d..af01e6bb6a0 100644 --- a/solr/src/java/org/apache/solr/analysis/BaseTokenStreamFactory.java +++ b/solr/src/java/org/apache/solr/analysis/BaseTokenStreamFactory.java @@ -17,13 +17,17 @@ package org.apache.solr.analysis; +import org.apache.solr.common.ResourceLoader; +import org.apache.solr.common.util.StrUtils; import org.apache.solr.core.Config; -import org.apache.solr.common.SolrException; import org.apache.solr.schema.IndexSchema; +import java.io.IOException; +import java.util.List; import java.util.Map; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.util.Version; @@ -94,4 +98,22 @@ abstract class BaseTokenStreamFactory { return Boolean.parseBoolean(s); } + protected CharArraySet getWordSet(ResourceLoader loader, + String wordFiles, boolean ignoreCase) throws IOException { + assureMatchVersion(); + List files = StrUtils.splitFileNames(wordFiles); + CharArraySet words = null; + if (files.size() > 0) { + // default stopwords list has 35 or so words, but maybe don't make it that + // big to start + words = new CharArraySet(luceneMatchVersion, + files.size() * 10, ignoreCase); + for (String file : files) { + List wlist = loader.getLines(file.trim()); + words.addAll(StopFilter.makeStopSet(luceneMatchVersion, wlist, + ignoreCase)); + } + } + return words; + } } diff --git a/solr/src/java/org/apache/solr/analysis/BrazilianStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/BrazilianStemFilterFactory.java index 1045ca2c466..26ca406af26 100644 --- a/solr/src/java/org/apache/solr/analysis/BrazilianStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/BrazilianStemFilterFactory.java @@ -18,15 +18,10 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.br.*; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenFilter; import 
org.apache.lucene.analysis.TokenStream; -import java.io.IOException; -import java.util.HashSet; -import java.util.Hashtable; -import java.util.Set; -import java.util.Map; +import org.apache.lucene.analysis.br.BrazilianStemFilter; + +/** Factory for {@link BrazilianStemFilter} */ public class BrazilianStemFilterFactory extends BaseTokenFilterFactory { public BrazilianStemFilter create(TokenStream in) { return new BrazilianStemFilter(in); diff --git a/solr/src/java/org/apache/solr/analysis/BufferedTokenStream.java b/solr/src/java/org/apache/solr/analysis/BufferedTokenStream.java index 20cd3879a8e..3fec7833a03 100644 --- a/solr/src/java/org/apache/solr/analysis/BufferedTokenStream.java +++ b/solr/src/java/org/apache/solr/analysis/BufferedTokenStream.java @@ -73,12 +73,12 @@ public abstract class BufferedTokenStream extends TokenFilter { private final LinkedList inQueue = new LinkedList(); private final LinkedList outQueue = new LinkedList(); - private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class); - private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); - private final TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); - private final FlagsAttribute flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class); - private final PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); - private final PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + private final TermAttribute termAtt = addAttribute(TermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class); + private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class); + private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); public BufferedTokenStream(TokenStream input) { super(input); diff --git a/solr/src/java/org/apache/solr/analysis/BulgarianStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/BulgarianStemFilterFactory.java index f7f3bb844da..53acdf93c2e 100644 --- a/solr/src/java/org/apache/solr/analysis/BulgarianStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/BulgarianStemFilterFactory.java @@ -20,7 +20,7 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.bg.BulgarianStemFilter; -/** Factory for BulgarianStemFilter */ +/** Factory for {@link BulgarianStemFilter} */ public class BulgarianStemFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new BulgarianStemFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/CJKTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/CJKTokenizerFactory.java index e68265c4ce9..2e0e7f8933a 100644 --- a/solr/src/java/org/apache/solr/analysis/CJKTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/CJKTokenizerFactory.java @@ -18,11 +18,11 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.cjk.*; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.Tokenizer; + +import org.apache.lucene.analysis.cjk.CJKTokenizer; import java.io.Reader; -import java.util.Map; + +/** Factory for {@link CJKTokenizer} */ public class 
CJKTokenizerFactory extends BaseTokenizerFactory { public CJKTokenizer create(Reader in) { return new CJKTokenizer(in); diff --git a/solr/src/java/org/apache/solr/analysis/CapitalizationFilterFactory.java b/solr/src/java/org/apache/solr/analysis/CapitalizationFilterFactory.java index 1aa6b439398..025dd4fbf20 100644 --- a/solr/src/java/org/apache/solr/analysis/CapitalizationFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/CapitalizationFilterFactory.java @@ -75,6 +75,7 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory { @Override public void init(Map args) { super.init(args); + assureMatchVersion(); String k = args.get(KEEP); if (k != null) { @@ -84,7 +85,7 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory { if ("true".equalsIgnoreCase(ignoreStr)) { ignoreCase = true; } - keep = new CharArraySet(10, ignoreCase); + keep = new CharArraySet(luceneMatchVersion, 10, ignoreCase); while (st.hasMoreTokens()) { k = st.nextToken().trim(); keep.add(k.toCharArray()); @@ -194,7 +195,7 @@ class CapitalizationFilter extends TokenFilter { public CapitalizationFilter(TokenStream in, final CapitalizationFilterFactory factory) { super(in); this.factory = factory; - this.termAtt = (TermAttribute) addAttribute(TermAttribute.class); + this.termAtt = addAttribute(TermAttribute.class); } @Override diff --git a/solr/src/java/org/apache/solr/analysis/ChineseFilterFactory.java b/solr/src/java/org/apache/solr/analysis/ChineseFilterFactory.java index 0076220bf38..7e2857c48d3 100644 --- a/solr/src/java/org/apache/solr/analysis/ChineseFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/ChineseFilterFactory.java @@ -18,10 +18,14 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.cn.*; -import java.util.Hashtable; -import org.apache.lucene.analysis.*; -import java.util.Map; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.cn.ChineseFilter; + +/** + * Factory for {@link ChineseFilter} + * @deprecated Use {@link StopFilterFactory} instead. + */ +@Deprecated public class ChineseFilterFactory extends BaseTokenFilterFactory { public ChineseFilter create(TokenStream in) { return new ChineseFilter(in); diff --git a/solr/src/java/org/apache/solr/analysis/ChineseTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/ChineseTokenizerFactory.java index a817ce000a5..2f0a4e83cc2 100644 --- a/solr/src/java/org/apache/solr/analysis/ChineseTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/ChineseTokenizerFactory.java @@ -18,10 +18,15 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.cn.*; + import java.io.Reader; -import org.apache.lucene.analysis.*; -import java.util.Map; +import org.apache.lucene.analysis.cn.ChineseTokenizer; + +/** + * Factory for {@link ChineseTokenizer} + * @deprecated Use {@link StandardTokenizerFactory} instead. 
+ */ +@Deprecated public class ChineseTokenizerFactory extends BaseTokenizerFactory { public ChineseTokenizer create(Reader in) { return new ChineseTokenizer(in); diff --git a/solr/src/java/org/apache/solr/analysis/CommonGramsFilter.java b/solr/src/java/org/apache/solr/analysis/CommonGramsFilter.java index 9aa551e51e6..0cad27ac8dd 100644 --- a/solr/src/java/org/apache/solr/analysis/CommonGramsFilter.java +++ b/solr/src/java/org/apache/solr/analysis/CommonGramsFilter.java @@ -20,6 +20,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.util.Version; /* * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors @@ -51,15 +52,25 @@ public final class CommonGramsFilter extends TokenFilter { private final StringBuilder buffer = new StringBuilder(); - private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class); - private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class); - private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class); - private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + private final TermAttribute termAttribute = addAttribute(TermAttribute.class); + private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); + private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class); + private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class); private int lastStartOffset; private boolean lastWasCommon; private State savedState; + /** @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set)} instead */ + public CommonGramsFilter(TokenStream input, Set commonWords) { + this(Version.LUCENE_29, input, commonWords); + } + + /** @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set, boolean)} instead */ + public CommonGramsFilter(TokenStream input, Set commonWords, boolean ignoreCase) { + this(Version.LUCENE_29, input, commonWords, ignoreCase); + } + /** * Construct a token stream filtering the given input using a Set of common * words to create bigrams. Outputs both unigrams with position increment and @@ -69,8 +80,8 @@ public final class CommonGramsFilter extends TokenFilter { * @param input TokenStream input in filter chain * @param commonWords The set of common words. */ - public CommonGramsFilter(TokenStream input, Set commonWords) { - this(input, commonWords, false); + public CommonGramsFilter(Version matchVersion, TokenStream input, Set commonWords) { + this(matchVersion, input, commonWords, false); } /** @@ -90,12 +101,12 @@ public final class CommonGramsFilter extends TokenFilter { * @param commonWords The set of common words. * @param ignoreCase -Ignore case when constructing bigrams for common words. 
*/ - public CommonGramsFilter(TokenStream input, Set commonWords, boolean ignoreCase) { + public CommonGramsFilter(Version matchVersion, TokenStream input, Set commonWords, boolean ignoreCase) { super(input); if (commonWords instanceof CharArraySet) { this.commonWords = (CharArraySet) commonWords; } else { - this.commonWords = new CharArraySet(commonWords.size(), ignoreCase); + this.commonWords = new CharArraySet(matchVersion, commonWords.size(), ignoreCase); this.commonWords.addAll(commonWords); } } @@ -106,7 +117,9 @@ public final class CommonGramsFilter extends TokenFilter { * * @param input Tokenstream in filter chain * @param commonWords words to be used in constructing bigrams + * @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set)} instead. */ + @Deprecated public CommonGramsFilter(TokenStream input, String[] commonWords) { this(input, commonWords, false); } @@ -118,7 +131,9 @@ public final class CommonGramsFilter extends TokenFilter { * @param input Tokenstream in filter chain * @param commonWords words to be used in constructing bigrams * @param ignoreCase -Ignore case when constructing bigrams for common words. + * @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set, boolean)} instead. */ + @Deprecated public CommonGramsFilter(TokenStream input, String[] commonWords, boolean ignoreCase) { super(input); this.commonWords = makeCommonSet(commonWords, ignoreCase); @@ -132,7 +147,9 @@ public final class CommonGramsFilter extends TokenFilter { * @param commonWords Array of common words which will be converted into the CharArraySet * @return CharArraySet of the given words, appropriate for passing into the CommonGramFilter constructor * @see #makeCommonSet(java.lang.String[], boolean) passing false to ignoreCase + * @deprecated create a CharArraySet with CharArraySet instead */ + @Deprecated public static CharArraySet makeCommonSet(String[] commonWords) { return makeCommonSet(commonWords, false); } @@ -145,7 +162,9 @@ public final class CommonGramsFilter extends TokenFilter { * @param commonWords Array of common words which will be converted into the CharArraySet * @param ignoreCase If true, all words are lower cased first. 
* @return a Set containing the words + * @deprecated create a CharArraySet with CharArraySet instead */ + @Deprecated public static CharArraySet makeCommonSet(String[] commonWords, boolean ignoreCase) { CharArraySet commonSet = new CharArraySet(commonWords.length, ignoreCase); commonSet.addAll(Arrays.asList(commonWords)); diff --git a/solr/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java b/solr/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java index 6668974190d..3a84708440c 100644 --- a/solr/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java @@ -17,14 +17,12 @@ package org.apache.solr.analysis; import java.io.IOException; -import java.util.List; import java.util.Set; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.StopAnalyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.solr.common.ResourceLoader; -import org.apache.solr.common.util.StrUtils; import org.apache.solr.util.plugin.ResourceLoaderAware; /** @@ -43,16 +41,7 @@ public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements if (commonWordFiles != null) { try { - List files = StrUtils.splitFileNames(commonWordFiles); - if (commonWords == null && files.size() > 0){ - //default stopwords list has 35 or so words, but maybe don't make it that big to start - commonWords = new CharArraySet(files.size() * 10, ignoreCase); - } - for (String file : files) { - List wlist = loader.getLines(file.trim()); - //TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call - commonWords.addAll(CommonGramsFilter.makeCommonSet((String[])wlist.toArray(new String[0]), ignoreCase)); - } + commonWords = getWordSet(loader, commonWordFiles, ignoreCase); } catch (IOException e) { throw new RuntimeException(e); } @@ -69,12 +58,12 @@ public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements return ignoreCase; } - public Set getCommonWords() { + public Set getCommonWords() { return commonWords; } public CommonGramsFilter create(TokenStream input) { - CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords, ignoreCase); + CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords, ignoreCase); return commonGrams; } } diff --git a/solr/src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java b/solr/src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java index bbd63e1d181..303026dbadc 100644 --- a/solr/src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java +++ b/solr/src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java @@ -47,8 +47,8 @@ import static org.apache.solr.analysis.CommonGramsFilter.GRAM_TYPE; */ public final class CommonGramsQueryFilter extends TokenFilter { - private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class); - private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class); + private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class); private State previous; private String previousType; diff --git a/solr/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java b/solr/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java index 
6e05dc3a230..7f61e1de029 100644 --- a/solr/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java @@ -17,14 +17,13 @@ package org.apache.solr.analysis; import java.io.IOException; -import java.util.List; +import java.util.Map; import java.util.Set; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.StopAnalyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.solr.common.ResourceLoader; -import org.apache.solr.common.util.StrUtils; import org.apache.solr.util.plugin.ResourceLoaderAware; /** @@ -36,25 +35,19 @@ import org.apache.solr.util.plugin.ResourceLoaderAware; public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { + @Override + public void init(Map args) { + super.init(args); + assureMatchVersion(); + } + public void inform(ResourceLoader loader) { String commonWordFiles = args.get("words"); ignoreCase = getBoolean("ignoreCase", false); if (commonWordFiles != null) { try { - List files = StrUtils.splitFileNames(commonWordFiles); - if (commonWords == null && files.size() > 0) { - // default stopwords list has 35 or so words, but maybe don't make it - // that big to start - commonWords = new CharArraySet(files.size() * 10, ignoreCase); - } - for (String file : files) { - List wlist = loader.getLines(file.trim()); - // TODO: once StopFilter.makeStopSet(List) method is available, switch - // to using that so we can avoid a toArray() call - commonWords.addAll(CommonGramsFilter.makeCommonSet((String[]) wlist - .toArray(new String[0]), ignoreCase)); - } + commonWords = getWordSet(loader, commonWordFiles, ignoreCase); } catch (IOException e) { throw new RuntimeException(e); } @@ -73,7 +66,7 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory return ignoreCase; } - public Set getCommonWords() { + public Set getCommonWords() { return commonWords; } @@ -81,7 +74,7 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory * Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter */ public CommonGramsQueryFilter create(TokenStream input) { - CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords, + CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords, ignoreCase); CommonGramsQueryFilter commonGramsQuery = new CommonGramsQueryFilter( commonGrams); diff --git a/solr/src/java/org/apache/solr/analysis/CzechStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/CzechStemFilterFactory.java index dec6be3ee24..f3f889f146d 100644 --- a/solr/src/java/org/apache/solr/analysis/CzechStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/CzechStemFilterFactory.java @@ -20,7 +20,7 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.cz.CzechStemFilter; -/** Factory for CzechStemFilter */ +/** Factory for {@link CzechStemFilter} */ public class CzechStemFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new CzechStemFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/DelimitedPayloadTokenFilterFactory.java b/solr/src/java/org/apache/solr/analysis/DelimitedPayloadTokenFilterFactory.java index 51cb85ab8ed..69127a5712c 100644 --- a/solr/src/java/org/apache/solr/analysis/DelimitedPayloadTokenFilterFactory.java +++ 
b/solr/src/java/org/apache/solr/analysis/DelimitedPayloadTokenFilterFactory.java @@ -31,7 +31,7 @@ import java.util.Map; /** * - * + * Factory for {@link DelimitedPayloadTokenFilter} **/ public class DelimitedPayloadTokenFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { public static final String ENCODER_ATTR = "encoder"; diff --git a/solr/src/java/org/apache/solr/analysis/DictionaryCompoundWordTokenFilterFactory.java b/solr/src/java/org/apache/solr/analysis/DictionaryCompoundWordTokenFilterFactory.java index e6700c80fbc..64a0434d798 100644 --- a/solr/src/java/org/apache/solr/analysis/DictionaryCompoundWordTokenFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/DictionaryCompoundWordTokenFilterFactory.java @@ -18,20 +18,18 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.compound.*; import org.apache.solr.util.plugin.ResourceLoaderAware; import org.apache.solr.common.ResourceLoader; import org.apache.solr.common.SolrException; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; -import java.util.List; -import java.util.Set; import java.util.Map; import java.io.IOException; +/** Factory for {@link DictionaryCompoundWordTokenFilter} */ public class DictionaryCompoundWordTokenFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { - private Set dictionary; + private CharArraySet dictionary; private String dictFile; private int minWordSize; private int minSubwordSize; @@ -39,6 +37,7 @@ public class DictionaryCompoundWordTokenFilterFactory extends BaseTokenFilterFac private boolean onlyLongestMatch; public void init(Map args) { super.init(args); + assureMatchVersion(); dictFile = args.get("dictionary"); if (null == dictFile) { throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, @@ -52,14 +51,13 @@ public class DictionaryCompoundWordTokenFilterFactory extends BaseTokenFilterFac } public void inform(ResourceLoader loader) { try { - List wlist = loader.getLines(dictFile); - dictionary = StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), false); + dictionary = super.getWordSet(loader, dictFile, false); } catch (IOException e) { throw new RuntimeException(e); } } public DictionaryCompoundWordTokenFilter create(TokenStream input) { - return new DictionaryCompoundWordTokenFilter(input,dictionary,minWordSize,minSubwordSize,maxSubwordSize,onlyLongestMatch); + return new DictionaryCompoundWordTokenFilter(luceneMatchVersion,input,dictionary,minWordSize,minSubwordSize,maxSubwordSize,onlyLongestMatch); } } diff --git a/solr/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java b/solr/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java index 450dc1f6eb9..3b8ed14cfe1 100644 --- a/solr/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java +++ b/solr/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java @@ -20,11 +20,9 @@ import java.io.IOException; import java.util.LinkedList; import org.apache.commons.codec.language.DoubleMetaphone; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; public class DoubleMetaphoneFilter extends TokenFilter { @@ -41,8 +39,8 
@@ public class DoubleMetaphoneFilter extends TokenFilter { super(input); this.encoder.setMaxCodeLen(maxCodeLength); this.inject = inject; - this.termAtt = (TermAttribute) addAttribute(TermAttribute.class); - this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + this.termAtt = addAttribute(TermAttribute.class); + this.posAtt = addAttribute(PositionIncrementAttribute.class); } @Override diff --git a/solr/src/java/org/apache/solr/analysis/DutchStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/DutchStemFilterFactory.java index 8dfb8bbab5f..77b74c1e5ba 100644 --- a/solr/src/java/org/apache/solr/analysis/DutchStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/DutchStemFilterFactory.java @@ -18,19 +18,19 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.nl.*; -import org.apache.lucene.analysis.Token; + +import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import java.io.IOException; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Set; -import java.util.Map; -import java.util.Map; + +/** + * @deprecated Use {@link SnowballPorterFilterFactory} with "Dutch" instead, + * which has the same functionality. + */ +@Deprecated public class DutchStemFilterFactory extends BaseTokenFilterFactory { - public DutchStemFilter create(TokenStream _in) { - return new DutchStemFilter(_in); + public TokenFilter create(TokenStream _in) { + return new SnowballFilter(_in, new org.tartarus.snowball.ext.DutchStemmer()); } } diff --git a/solr/src/java/org/apache/solr/analysis/ElisionFilterFactory.java b/solr/src/java/org/apache/solr/analysis/ElisionFilterFactory.java index a3350f00640..7f7ef91aa3c 100644 --- a/solr/src/java/org/apache/solr/analysis/ElisionFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/ElisionFilterFactory.java @@ -21,32 +21,22 @@ package org.apache.solr.analysis; import org.apache.solr.common.ResourceLoader; import org.apache.solr.util.plugin.ResourceLoaderAware; -import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.fr.*; import java.io.IOException; -import java.util.Set; -import java.util.HashSet; -import java.util.Arrays; -import java.util.Iterator; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.TokenFilter; -import java.util.Map; -import java.util.List; -import java.util.Set; -import java.io.IOException; +/** Factory for {@link ElisionFilter} */ public class ElisionFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { - private Set articles; + private CharArraySet articles; public void inform(ResourceLoader loader) { String articlesFile = args.get("articles"); if (articlesFile != null) { try { - List wlist = loader.getLines(articlesFile); - articles = StopFilter.makeStopSet((String[])wlist.toArray(new String[0]), false); + articles = getWordSet(loader, articlesFile, false); } catch (IOException e) { throw new RuntimeException(e); } diff --git a/solr/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java b/solr/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java index 4dc5a9c8da5..e185e4d298b 100644 --- a/solr/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java @@ -18,17 +18,14 @@ 
package org.apache.solr.analysis; import org.apache.lucene.analysis.CharArraySet; -import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.KeywordMarkerTokenFilter; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.solr.common.ResourceLoader; -import org.apache.solr.common.util.StrUtils; import org.apache.solr.util.plugin.ResourceLoaderAware; -import org.tartarus.snowball.SnowballProgram; import java.io.IOException; -import java.io.File; -import java.util.List; /** * @version $Id$ @@ -42,21 +39,7 @@ public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implement String wordFiles = args.get(PROTECTED_TOKENS); if (wordFiles != null) { try { - File protectedWordFiles = new File(wordFiles); - if (protectedWordFiles.exists()) { - List wlist = loader.getLines(wordFiles); - //This cast is safe in Lucene - protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally - } else { - List files = StrUtils.splitFileNames(wordFiles); - for (String file : files) { - List wlist = loader.getLines(file.trim()); - if (protectedWords == null) - protectedWords = new CharArraySet(wlist, false); - else - protectedWords.addAll(wlist); - } - } + protectedWords = getWordSet(loader, wordFiles, false); } catch (IOException e) { throw new RuntimeException(e); } @@ -65,20 +48,10 @@ public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implement private CharArraySet protectedWords = null; - public EnglishPorterFilter create(TokenStream input) { - return new EnglishPorterFilter(input, protectedWords); + public TokenFilter create(TokenStream input) { + if (protectedWords != null) + input = new KeywordMarkerTokenFilter(input, protectedWords); + return new SnowballFilter(input, new org.tartarus.snowball.ext.EnglishStemmer()); } } - - -/** - * English Porter2 filter that doesn't use reflection to - * adapt lucene to the snowball stemmer code. - */ -@Deprecated -class EnglishPorterFilter extends SnowballPorterFilter { - public EnglishPorterFilter(TokenStream source, CharArraySet protWords) { - super(source, new org.tartarus.snowball.ext.EnglishStemmer(), protWords); - } -} diff --git a/solr/src/java/org/apache/solr/analysis/FrenchStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/FrenchStemFilterFactory.java index 42a92d73dfb..cef735f71a8 100644 --- a/solr/src/java/org/apache/solr/analysis/FrenchStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/FrenchStemFilterFactory.java @@ -18,18 +18,19 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.fr.*; -import org.apache.lucene.analysis.Token; + +import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import java.io.IOException; -import java.util.Hashtable; -import java.util.HashSet; -import java.util.Set; -import java.util.Map; + +/** + * @deprecated Use {@link SnowballPorterFilterFactory} with "French" instead, + * which has the same functionality. 
+ */ +@Deprecated public class FrenchStemFilterFactory extends BaseTokenFilterFactory { - public FrenchStemFilter create(TokenStream in) { - return new FrenchStemFilter(in); + public TokenFilter create(TokenStream in) { + return new SnowballFilter(in, new org.tartarus.snowball.ext.FrenchStemmer()); } } diff --git a/solr/src/java/org/apache/solr/analysis/GermanStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/GermanStemFilterFactory.java index 109c1030354..2c85c44108d 100644 --- a/solr/src/java/org/apache/solr/analysis/GermanStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/GermanStemFilterFactory.java @@ -18,13 +18,11 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.de.*; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenFilter; + +import org.apache.lucene.analysis.de.GermanStemFilter; import org.apache.lucene.analysis.TokenStream; -import java.io.IOException; -import java.util.Set; -import java.util.Map; + +/** Factory for {@link GermanStemFilter} */ public class GermanStemFilterFactory extends BaseTokenFilterFactory { public GermanStemFilter create(TokenStream in) { return new GermanStemFilter(in); diff --git a/solr/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java b/solr/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java index 05490ac7acd..61fc2c06ad1 100644 --- a/solr/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java @@ -26,6 +26,7 @@ import org.apache.lucene.analysis.el.GreekLowerCaseFilter; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; +/** Factory for {@link GreekLowerCaseFilter} */ public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory { diff --git a/solr/src/java/org/apache/solr/analysis/HindiNormalizationFilterFactory.java b/solr/src/java/org/apache/solr/analysis/HindiNormalizationFilterFactory.java index e45554999a6..50946fb02ed 100644 --- a/solr/src/java/org/apache/solr/analysis/HindiNormalizationFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/HindiNormalizationFilterFactory.java @@ -20,7 +20,7 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.hi.HindiNormalizationFilter; -/** Factory for HindiNormalizationFilter */ +/** Factory for {@link HindiNormalizationFilter} */ public class HindiNormalizationFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new HindiNormalizationFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/HindiStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/HindiStemFilterFactory.java index 43deee6e6db..406343e021e 100644 --- a/solr/src/java/org/apache/solr/analysis/HindiStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/HindiStemFilterFactory.java @@ -20,7 +20,7 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.hi.HindiStemFilter; -/** Factory for HindiStemFilter */ +/** Factory for {@link HindiStemFilter} */ public class HindiStemFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new HindiStemFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilter.java b/solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilter.java index 428bd68e2b5..aadfc682d80 
100755 --- a/solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilter.java +++ b/solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilter.java @@ -54,8 +54,8 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute; */ public final class HyphenatedWordsFilter extends TokenFilter { - private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class); - private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class); + private final TermAttribute termAttribute = addAttribute(TermAttribute.class); + private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); private final StringBuilder hyphenated = new StringBuilder(); private State savedState; diff --git a/solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilterFactory.java b/solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilterFactory.java index bf970254f78..e6923cddc0a 100755 --- a/solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilterFactory.java @@ -21,7 +21,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.solr.analysis.BaseTokenFilterFactory; /** - * Factory for HyphenatedWordsFilter + * Factory for {@link HyphenatedWordsFilter} */ public class HyphenatedWordsFilterFactory extends BaseTokenFilterFactory { public HyphenatedWordsFilter create(TokenStream input) { diff --git a/solr/src/java/org/apache/solr/analysis/ISOLatin1AccentFilterFactory.java b/solr/src/java/org/apache/solr/analysis/ISOLatin1AccentFilterFactory.java index 8181eddf615..807be54abd8 100644 --- a/solr/src/java/org/apache/solr/analysis/ISOLatin1AccentFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/ISOLatin1AccentFilterFactory.java @@ -21,8 +21,10 @@ import org.apache.lucene.analysis.ISOLatin1AccentFilter; import org.apache.lucene.analysis.TokenStream; /** Factory for ISOLatin1AccentFilter + * @deprecated Use {@link ASCIIFoldingFilterFactory} instead. 
* $Id$ */ +@Deprecated public class ISOLatin1AccentFilterFactory extends BaseTokenFilterFactory { public ISOLatin1AccentFilter create(TokenStream input) { return new ISOLatin1AccentFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/IndicNormalizationFilterFactory.java b/solr/src/java/org/apache/solr/analysis/IndicNormalizationFilterFactory.java index a72f9c0d57e..84264dd8ed8 100644 --- a/solr/src/java/org/apache/solr/analysis/IndicNormalizationFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/IndicNormalizationFilterFactory.java @@ -20,7 +20,7 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.in.IndicNormalizationFilter; -/** Factory for IndicNormalizationFilter */ +/** Factory for {@link IndicNormalizationFilter} */ public class IndicNormalizationFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new IndicNormalizationFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/IndicTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/IndicTokenizerFactory.java index 4a51f257929..0da0c7cb35a 100644 --- a/solr/src/java/org/apache/solr/analysis/IndicTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/IndicTokenizerFactory.java @@ -22,7 +22,7 @@ import java.io.Reader; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.in.IndicTokenizer; -/** Factory for IndicTokenizer */ +/** Factory for {@link IndicTokenizer} */ public class IndicTokenizerFactory extends BaseTokenizerFactory { public Tokenizer create(Reader input) { assureMatchVersion(); diff --git a/solr/src/java/org/apache/solr/analysis/KeepWordFilter.java b/solr/src/java/org/apache/solr/analysis/KeepWordFilter.java index 5ac3585bb7b..ca26532a67d 100644 --- a/solr/src/java/org/apache/solr/analysis/KeepWordFilter.java +++ b/solr/src/java/org/apache/solr/analysis/KeepWordFilter.java @@ -19,10 +19,8 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import org.tartarus.snowball.SnowballProgram; import java.io.IOException; import java.util.Set; @@ -38,6 +36,8 @@ public final class KeepWordFilter extends TokenFilter { private final CharArraySet words; private final TermAttribute termAtt; + /** @deprecated Use {@link #KeepWordFilter(TokenStream, Set, boolean)} instead */ + @Deprecated public KeepWordFilter(TokenStream in, Set words, boolean ignoreCase ) { this(in, new CharArraySet(words, ignoreCase)); } @@ -47,7 +47,7 @@ public final class KeepWordFilter extends TokenFilter { public KeepWordFilter(TokenStream in, CharArraySet words) { super(in); this.words = words; - this.termAtt = (TermAttribute)addAttribute(TermAttribute.class); + this.termAtt = addAttribute(TermAttribute.class); } @Override diff --git a/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java b/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java index a2a12db67dd..ab170bd653f 100644 --- a/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java @@ -18,17 +18,11 @@ package org.apache.solr.analysis; import org.apache.solr.common.ResourceLoader; -import org.apache.solr.common.util.StrUtils; import 
org.apache.solr.util.plugin.ResourceLoaderAware; -import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.CharArraySet; -import java.util.HashSet; -import java.util.List; import java.util.Set; -import java.io.File; -import java.io.File; import java.io.IOException; /** @@ -40,23 +34,13 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements Res private CharArraySet words; private boolean ignoreCase; - @SuppressWarnings("unchecked") public void inform(ResourceLoader loader) { String wordFiles = args.get("words"); ignoreCase = getBoolean("ignoreCase", false); - if (wordFiles != null) { + if (wordFiles != null) { try { - List files = StrUtils.splitFileNames(wordFiles); - if (words == null && files.size() > 0){ - words = new CharArraySet(files.size() * 10, ignoreCase); - } - for (String file : files) { - List wlist = loader.getLines(file.trim()); - //TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call - words.addAll(StopFilter.makeStopSet((String[]) wlist.toArray(new String[0]), ignoreCase)); - } - } - catch (IOException e) { + words = getWordSet(loader, wordFiles, ignoreCase); + } catch (IOException e) { throw new RuntimeException(e); } } @@ -67,14 +51,14 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements Res * NOTE: if ignoreCase==true, the words are expected to be lowercase */ public void setWords(Set words) { - this.words = new CharArraySet(words, ignoreCase); + this.words = new CharArraySet(luceneMatchVersion, words, ignoreCase); } - public void setIgnoreCase(boolean ignoreCase) { - this.ignoreCase = ignoreCase; - if (words != null) { - words = new CharArraySet(words, ignoreCase); + public void setIgnoreCase(boolean ignoreCase) { + if (words != null && this.ignoreCase != ignoreCase) { + words = new CharArraySet(luceneMatchVersion, words, ignoreCase); } + this.ignoreCase = ignoreCase; } public KeepWordFilter create(TokenStream input) { diff --git a/solr/src/java/org/apache/solr/analysis/KeywordMarkerFilterFactory.java b/solr/src/java/org/apache/solr/analysis/KeywordMarkerFilterFactory.java new file mode 100644 index 00000000000..59ad0d12f6d --- /dev/null +++ b/solr/src/java/org/apache/solr/analysis/KeywordMarkerFilterFactory.java @@ -0,0 +1,55 @@ +package org.apache.solr.analysis; + +import java.io.IOException; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.KeywordMarkerTokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.solr.common.ResourceLoader; +import org.apache.solr.util.plugin.ResourceLoaderAware; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * Factory for {@link KeywordMarkerTokenFilter} + */ +public class KeywordMarkerFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { + public static final String PROTECTED_TOKENS = "protected"; + private CharArraySet protectedWords; + private boolean ignoreCase; + + public void inform(ResourceLoader loader) { + String wordFiles = args.get(PROTECTED_TOKENS); + ignoreCase = getBoolean("ignoreCase", false); + if (wordFiles != null) { + try { + protectedWords = getWordSet(loader, wordFiles, ignoreCase); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + } + + public boolean isIgnoreCase() { + return ignoreCase; + } + + public TokenStream create(TokenStream input) { + return protectedWords == null ? input : new KeywordMarkerTokenFilter(input, protectedWords); + } +} diff --git a/solr/src/java/org/apache/solr/analysis/KeywordTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/KeywordTokenizerFactory.java index 8b9318a9889..1edaf2e51d3 100644 --- a/solr/src/java/org/apache/solr/analysis/KeywordTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/KeywordTokenizerFactory.java @@ -17,7 +17,6 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.KeywordTokenizer; import java.io.Reader; diff --git a/solr/src/java/org/apache/solr/analysis/LetterTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/LetterTokenizerFactory.java index 84be9c372f9..09f34dc046d 100644 --- a/solr/src/java/org/apache/solr/analysis/LetterTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/LetterTokenizerFactory.java @@ -17,17 +17,23 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.LetterTokenizer; import java.io.Reader; +import java.util.Map; /** * @version $Id$ */ public class LetterTokenizerFactory extends BaseTokenizerFactory { - public LetterTokenizer create(Reader input) { + + @Override + public void init(Map args) { + super.init(args); assureMatchVersion(); + } + + public LetterTokenizer create(Reader input) { return new LetterTokenizer(luceneMatchVersion, input); } } diff --git a/solr/src/java/org/apache/solr/analysis/LowerCaseFilterFactory.java b/solr/src/java/org/apache/solr/analysis/LowerCaseFilterFactory.java index 20a67cb0337..67af3ce0d7f 100644 --- a/solr/src/java/org/apache/solr/analysis/LowerCaseFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/LowerCaseFilterFactory.java @@ -17,6 +17,8 @@ package org.apache.solr.analysis; +import java.util.Map; + import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.LowerCaseFilter; @@ -24,8 +26,13 @@ import org.apache.lucene.analysis.LowerCaseFilter; * @version $Id$ */ public class LowerCaseFilterFactory extends BaseTokenFilterFactory { - public LowerCaseFilter create(TokenStream input) { + @Override + public void init(Map args) { + super.init(args); assureMatchVersion(); + } + + public LowerCaseFilter create(TokenStream input) { return new LowerCaseFilter(luceneMatchVersion,input); } } diff --git a/solr/src/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java index b92441c5864..5c45c8a2580 100644 --- a/solr/src/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java @@ -17,17 +17,22 @@ package org.apache.solr.analysis; -import 
org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.LowerCaseTokenizer; import java.io.Reader; +import java.util.Map; /** * @version $Id$ */ public class LowerCaseTokenizerFactory extends BaseTokenizerFactory { - public LowerCaseTokenizer create(Reader input) { + @Override + public void init(Map args) { + super.init(args); assureMatchVersion(); + } + + public LowerCaseTokenizer create(Reader input) { return new LowerCaseTokenizer(luceneMatchVersion,input); } } diff --git a/solr/src/java/org/apache/solr/analysis/NumericPayloadTokenFilterFactory.java b/solr/src/java/org/apache/solr/analysis/NumericPayloadTokenFilterFactory.java index 880f83f8de0..69469eab54e 100644 --- a/solr/src/java/org/apache/solr/analysis/NumericPayloadTokenFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/NumericPayloadTokenFilterFactory.java @@ -18,13 +18,12 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.payloads.*; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenFilter; + +import org.apache.lucene.analysis.payloads.NumericPayloadTokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.index.Payload; -import java.io.IOException; import java.util.Map; + +/** Factory for {@link NumericPayloadTokenFilter} */ public class NumericPayloadTokenFilterFactory extends BaseTokenFilterFactory { private float payload; private String typeMatch; diff --git a/solr/src/java/org/apache/solr/analysis/PatternReplaceFilter.java b/solr/src/java/org/apache/solr/analysis/PatternReplaceFilter.java index dc4029e2a67..841f94e93d6 100644 --- a/solr/src/java/org/apache/solr/analysis/PatternReplaceFilter.java +++ b/solr/src/java/org/apache/solr/analysis/PatternReplaceFilter.java @@ -19,13 +19,10 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import java.util.regex.Pattern; import java.util.regex.Matcher; -import java.util.Set; import java.io.IOException; import java.nio.CharBuffer; @@ -66,7 +63,7 @@ public final class PatternReplaceFilter extends TokenFilter { this.p=p; this.replacement = (null == replacement) ? 
"" : replacement; this.all=all; - this.termAtt = (TermAttribute)addAttribute(TermAttribute.class); + this.termAtt = addAttribute(TermAttribute.class); } @Override diff --git a/solr/src/java/org/apache/solr/analysis/PatternTokenizer.java b/solr/src/java/org/apache/solr/analysis/PatternTokenizer.java index 88d9d1bb66b..9253e936f02 100644 --- a/solr/src/java/org/apache/solr/analysis/PatternTokenizer.java +++ b/solr/src/java/org/apache/solr/analysis/PatternTokenizer.java @@ -56,8 +56,8 @@ import org.apache.commons.io.IOUtils; */ public final class PatternTokenizer extends Tokenizer { - private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class); - private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + private final TermAttribute termAtt = addAttribute(TermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private String str; private int index; diff --git a/solr/src/java/org/apache/solr/analysis/PersianNormalizationFilterFactory.java b/solr/src/java/org/apache/solr/analysis/PersianNormalizationFilterFactory.java index 981f8f18113..7e095460215 100644 --- a/solr/src/java/org/apache/solr/analysis/PersianNormalizationFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/PersianNormalizationFilterFactory.java @@ -18,12 +18,11 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.fa.*; -import java.io.IOException; -import org.apache.lucene.analysis.TokenFilter; + +import org.apache.lucene.analysis.fa.PersianNormalizationFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import java.util.Map; + +/** Factory for {@link PersianNormalizationFilter} */ public class PersianNormalizationFilterFactory extends BaseTokenFilterFactory { public PersianNormalizationFilter create(TokenStream input) { return new PersianNormalizationFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/PhoneticFilter.java b/solr/src/java/org/apache/solr/analysis/PhoneticFilter.java index c097cfa7a21..dcf6d8d63cc 100644 --- a/solr/src/java/org/apache/solr/analysis/PhoneticFilter.java +++ b/solr/src/java/org/apache/solr/analysis/PhoneticFilter.java @@ -20,7 +20,6 @@ package org.apache.solr.analysis; import org.apache.commons.codec.Encoder; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; @@ -47,8 +46,8 @@ public class PhoneticFilter extends TokenFilter this.encoder = encoder; this.name = name; this.inject = inject; - this.termAtt = (TermAttribute) addAttribute(TermAttribute.class); - this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + this.termAtt = addAttribute(TermAttribute.class); + this.posAtt = addAttribute(PositionIncrementAttribute.class); } @Override diff --git a/solr/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilter.java b/solr/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilter.java index f030c2baa1c..563356c70af 100644 --- a/solr/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilter.java +++ b/solr/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilter.java @@ -17,11 +17,12 @@ package org.apache.solr.analysis; +import org.apache.lucene.analysis.CharArraySet; import 
org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import org.apache.solr.util.CharArrayMap; +import org.apache.lucene.util.Version; import java.io.IOException; @@ -30,12 +31,11 @@ import java.io.IOException; */ public final class RemoveDuplicatesTokenFilter extends TokenFilter { - private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class); - private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + private final TermAttribute termAttribute = addAttribute(TermAttribute.class); + private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class); - // keep a seen 'set' after each term with posInc > 0 - // for now use CharArrayMap vs CharArraySet, as it has clear() - private final CharArrayMap previous = new CharArrayMap(8, false); + // use a fixed version, as we don't care about case sensitivity. + private final CharArraySet previous = new CharArraySet(Version.LUCENE_31, 8, false); /** * Creates a new RemoveDuplicatesTokenFilter @@ -60,12 +60,12 @@ public final class RemoveDuplicatesTokenFilter extends TokenFilter { previous.clear(); } - boolean duplicate = (posIncrement == 0 && previous.get(term, 0, length) != null); + boolean duplicate = (posIncrement == 0 && previous.contains(term, 0, length)); // clone the term, and add to the set of seen terms. char saved[] = new char[length]; System.arraycopy(term, 0, saved, 0, length); - previous.put(saved, Boolean.TRUE); + previous.add(saved); if (!duplicate) { return true; diff --git a/solr/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java b/solr/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java index 99911843b6f..d820ec4040d 100644 --- a/solr/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java +++ b/solr/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java @@ -45,8 +45,8 @@ public class ReversedWildcardFilter extends TokenFilter { protected ReversedWildcardFilter(TokenStream input, boolean withOriginal, char markerChar) { super(input); - this.termAtt = (TermAttribute)addAttribute(TermAttribute.class); - this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + this.termAtt = addAttribute(TermAttribute.class); + this.posAtt = addAttribute(PositionIncrementAttribute.class); this.withOriginal = withOriginal; this.markerChar = markerChar; } diff --git a/solr/src/java/org/apache/solr/analysis/RussianCommon.java b/solr/src/java/org/apache/solr/analysis/RussianCommon.java deleted file mode 100644 index 9bdedd4abf7..00000000000 --- a/solr/src/java/org/apache/solr/analysis/RussianCommon.java +++ /dev/null @@ -1,61 +0,0 @@ - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -//package org.apache.solr.analysis; -//import org.apache.lucene.analysis.ru.*; -//import java.util.Map; -//import java.util.HashMap; -//import org.apache.solr.core.SolrConfig; -//import org.apache.solr.common.SolrException; -//import org.apache.solr.common.SolrException.ErrorCode; -//import org.slf4j.Logger; -//import org.slf4j.LoggerFactory; -// -//@Deprecated -//public class RussianCommon { -// -// private static Logger logger = LoggerFactory.getLogger(RussianCommon.class); -// -// private static Map CHARSETS = new HashMap(); -// static { -// CHARSETS.put("UnicodeRussian",RussianCharsets.UnicodeRussian); -// CHARSETS.put("KOI8",RussianCharsets.KOI8); -// CHARSETS.put("CP1251",RussianCharsets.CP1251); -// } -// -// public static char[] getCharset(String name) { -// if (null == name) -// return RussianCharsets.UnicodeRussian; -// -// char[] charset = CHARSETS.get(name); -// -// if (charset.equals(RussianCharsets.UnicodeRussian)) -// logger.warn("Specifying UnicodeRussian is no longer required (default). " -// + "Use of the charset parameter will cause an error in Solr 1.5"); -// else -// logger.warn("Support for this custom encoding is deprecated. " -// + "Use of the charset parameter will cause an error in Solr 1.5"); -// -// if (null == charset) { -// throw new SolrException(ErrorCode.SERVER_ERROR, -// "Don't understand charset: " + name); -// } -// return charset; -// } -//} - diff --git a/solr/src/java/org/apache/solr/analysis/RussianLetterTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/RussianLetterTokenizerFactory.java index cd4b670ef33..fd0ab7e3d96 100644 --- a/solr/src/java/org/apache/solr/analysis/RussianLetterTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/RussianLetterTokenizerFactory.java @@ -24,6 +24,10 @@ import org.apache.lucene.analysis.ru.RussianLetterTokenizer; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; +/** @deprecated Use {@link StandardTokenizerFactory} instead. + * This tokenizer has no Russian-specific functionality. + */ +@Deprecated public class RussianLetterTokenizerFactory extends BaseTokenizerFactory { @Override diff --git a/solr/src/java/org/apache/solr/analysis/RussianLowerCaseFilterFactory.java b/solr/src/java/org/apache/solr/analysis/RussianLowerCaseFilterFactory.java index a7f375e51c6..d240c366e41 100644 --- a/solr/src/java/org/apache/solr/analysis/RussianLowerCaseFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/RussianLowerCaseFilterFactory.java @@ -19,11 +19,17 @@ package org.apache.solr.analysis; import java.util.Map; +import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.ru.RussianLowerCaseFilter; +import org.apache.lucene.util.Version; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; +/** @deprecated Use {@link LowerCaseFilterFactory} instead which has the + * same functionality. 
+ */ +@Deprecated public class RussianLowerCaseFilterFactory extends BaseTokenFilterFactory { @Override @@ -35,8 +41,9 @@ public class RussianLowerCaseFilterFactory extends BaseTokenFilterFactory { + "Please process your documents as Unicode instead."); } - public RussianLowerCaseFilter create(TokenStream in) { - return new RussianLowerCaseFilter(in); + public TokenFilter create(TokenStream in) { + // hardcode the version to give exactly the old behavior + return new LowerCaseFilter(Version.LUCENE_29, in); } } diff --git a/solr/src/java/org/apache/solr/analysis/RussianStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/RussianStemFilterFactory.java index fb679aef4f1..be8423378c7 100644 --- a/solr/src/java/org/apache/solr/analysis/RussianStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/RussianStemFilterFactory.java @@ -19,16 +19,19 @@ package org.apache.solr.analysis; -import java.util.Map; - +import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.ru.RussianStemFilter; +import org.apache.lucene.analysis.snowball.SnowballFilter; +/** + * @deprecated Use {@link SnowballPorterFilterFactory} with "Russian" instead, + * which has the same functionality. + */ +@Deprecated public class RussianStemFilterFactory extends BaseTokenFilterFactory { - - public RussianStemFilter create(TokenStream in) { - return new RussianStemFilter(in); + public TokenFilter create(TokenStream in) { + return new SnowballFilter(in, new org.tartarus.snowball.ext.RussianStemmer()); } } diff --git a/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java b/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java index 6339e5c0fd0..9ebff5a8d12 100644 --- a/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java @@ -18,14 +18,12 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.shingle.*; -import java.io.IOException; -import java.util.LinkedList; -import java.util.Iterator; -import org.apache.lucene.analysis.TokenFilter; + +import org.apache.lucene.analysis.shingle.ShingleFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Token; import java.util.Map; + +/** Factory for {@link ShingleFilter} */ public class ShingleFilterFactory extends BaseTokenFilterFactory { private int maxShingleSize; private boolean outputUnigrams; diff --git a/solr/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java b/solr/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java index 1230d4581d2..852eee08387 100644 --- a/solr/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java @@ -17,26 +17,21 @@ package org.apache.solr.analysis; import java.util.Map; -import java.util.List; -import java.io.File; import java.io.IOException; +import org.apache.lucene.analysis.KeywordMarkerTokenFilter; +import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.CharArraySet; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.solr.common.ResourceLoader; -import org.apache.solr.common.util.StrUtils; import org.apache.solr.util.plugin.ResourceLoaderAware; 
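
The three Russian-specific factories above are deprecated rather than deleted; each now delegates to a general component. A sketch of an equivalent chain built directly (WhitespaceTokenizer stands in for whatever tokenizer the schema actually uses; the version constants follow the patch's own usage):

    import java.io.StringReader;
    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.snowball.SnowballFilter;
    import org.apache.lucene.util.Version;

    public class RussianChainSketch {
      public static TokenStream build(String text) {
        TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_31, new StringReader(text));
        ts = new LowerCaseFilter(Version.LUCENE_31, ts); // replaces RussianLowerCaseFilter
        // replaces RussianStemFilter:
        return new SnowballFilter(ts, new org.tartarus.snowball.ext.RussianStemmer());
      }
    }
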
import org.tartarus.snowball.SnowballProgram; /** - * Factory for SnowballFilters, with configurable language - * - * Browsing the code, SnowballFilter uses reflection to adapt to Lucene... don't - * use this if you are concerned about speed. Use EnglishPorterFilterFactory. + * Factory for {@link SnowballFilter}, with configurable language + *
+ * Note: Use of the "Lovins" stemmer is not recommended, as it is implemented with reflection. * * @version $Id$ */ @@ -44,28 +39,14 @@ public class SnowballPorterFilterFactory extends BaseTokenFilterFactory implemen public static final String PROTECTED_TOKENS = "protected"; private String language = "English"; - private Class stemClass; + private Class stemClass; public void inform(ResourceLoader loader) { String wordFiles = args.get(PROTECTED_TOKENS); if (wordFiles != null) { try { - File protectedWordFiles = new File(wordFiles); - if (protectedWordFiles.exists()) { - List wlist = loader.getLines(wordFiles); - //This cast is safe in Lucene - protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally - } else { - List files = StrUtils.splitFileNames(wordFiles); - for (String file : files) { - List wlist = loader.getLines(file.trim()); - if (protectedWords == null) - protectedWords = new CharArraySet(wlist, false); - else - protectedWords.addAll(wlist); - } - } + protectedWords = getWordSet(loader, wordFiles, false); } catch (IOException e) { throw new RuntimeException(e); } @@ -87,50 +68,17 @@ public class SnowballPorterFilterFactory extends BaseTokenFilterFactory implemen } } - public SnowballPorterFilter create(TokenStream input) { + public TokenFilter create(TokenStream input) { SnowballProgram program; try { program = (SnowballProgram)stemClass.newInstance(); } catch (Exception e) { throw new RuntimeException("Error instantiating stemmer for language " + language + "from class " +stemClass, e); } - return new SnowballPorterFilter(input, program, protectedWords); + + if (protectedWords != null) + input = new KeywordMarkerTokenFilter(input, protectedWords); + return new SnowballFilter(input, program); } } - -class SnowballPorterFilter extends TokenFilter { - private final CharArraySet protWords; - private final SnowballProgram stemmer; - private final TermAttribute termAtt; - - public SnowballPorterFilter(TokenStream source, SnowballProgram stemmer, CharArraySet protWords) { - super(source); - this.protWords = protWords; - this.stemmer = stemmer; - this.termAtt = (TermAttribute)addAttribute(TermAttribute.class); - } - - @Override - public boolean incrementToken() throws IOException { - if (!input.incrementToken()) return false; - - char[] termBuffer = termAtt.termBuffer(); - int len = termAtt.termLength(); - // if protected, don't stem. use this to avoid stemming collisions. 
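
The deleted SnowballPorterFilter below checked the protected-words set on every token itself; the rewritten create() instead marks protected tokens once with KeywordMarkerTokenFilter, which the Snowball stemmer honors. A sketch of the composed chain (input text and the protected word are illustrative):

    import java.io.StringReader;
    import org.apache.lucene.analysis.CharArraySet;
    import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.snowball.SnowballFilter;
    import org.apache.lucene.util.Version;

    public class ProtectedStemSketch {
      public static TokenStream build() {
        CharArraySet protect = new CharArraySet(Version.LUCENE_31, 1, false);
        protect.add("fledgling");
        TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_31,
            new StringReader("banks fledgling"));
        ts = new KeywordMarkerTokenFilter(ts, protect); // sets the keyword attribute
        // the stemmer skips marked tokens: yields "bank", then "fledgling" unchanged
        return new SnowballFilter(ts, new org.tartarus.snowball.ext.EnglishStemmer());
      }
    }
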
- if (protWords != null && protWords.contains(termBuffer, 0, len)) { - return true; - } - - stemmer.setCurrent(termBuffer, len); - stemmer.stem(); - final char finalTerm[] = stemmer.getCurrentBuffer(); - final int newLength = stemmer.getCurrentBufferLength(); - if (finalTerm != termBuffer) - termAtt.setTermBuffer(finalTerm, 0, newLength); - else - termAtt.setTermLength(newLength); - - return true; - } -} diff --git a/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java index f59924d6704..e7e27eed9dd 100644 --- a/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java @@ -17,18 +17,23 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardTokenizer; import java.io.Reader; +import java.util.Map; /** * @version $Id$ */ public class StandardTokenizerFactory extends BaseTokenizerFactory { - public StandardTokenizer create(Reader input) { + @Override + public void init(Map args) { + super.init(args); assureMatchVersion(); + } + + public StandardTokenizer create(Reader input) { return new StandardTokenizer(luceneMatchVersion, input); } } diff --git a/solr/src/java/org/apache/solr/analysis/StemmerOverrideFilterFactory.java b/solr/src/java/org/apache/solr/analysis/StemmerOverrideFilterFactory.java new file mode 100644 index 00000000000..da39537fc4a --- /dev/null +++ b/solr/src/java/org/apache/solr/analysis/StemmerOverrideFilterFactory.java @@ -0,0 +1,68 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.analysis.CharArrayMap; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter; +import org.apache.solr.common.ResourceLoader; +import org.apache.solr.common.util.StrUtils; +import org.apache.solr.util.plugin.ResourceLoaderAware; + +/** + * Factory for {@link StemmerOverrideFilter} + */ +public class StemmerOverrideFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { + private CharArrayMap dictionary = null; + private boolean ignoreCase; + + public void inform(ResourceLoader loader) { + String dictionaryFiles = args.get("dictionary"); + ignoreCase = getBoolean("ignoreCase", false); + if (dictionaryFiles != null) { + assureMatchVersion(); + List files = StrUtils.splitFileNames(dictionaryFiles); + try { + if (files.size() > 0) { + dictionary = new CharArrayMap(luceneMatchVersion, + files.size() * 10, ignoreCase); + for (String file : files) { + List list = loader.getLines(file.trim()); + for (String line : list) { + String[] mapping = line.split("\t", 2); + dictionary.put(mapping[0], mapping[1]); + } + } + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + } + + public boolean isIgnoreCase() { + return ignoreCase; + } + + public TokenStream create(TokenStream input) { + return dictionary == null ? input : new StemmerOverrideFilter(luceneMatchVersion, input, dictionary); + } +} diff --git a/solr/src/java/org/apache/solr/analysis/StopFilterFactory.java b/solr/src/java/org/apache/solr/analysis/StopFilterFactory.java index 335876f814a..563d5aa4de3 100644 --- a/solr/src/java/org/apache/solr/analysis/StopFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/StopFilterFactory.java @@ -18,18 +18,14 @@ package org.apache.solr.analysis; import org.apache.solr.common.ResourceLoader; -import org.apache.solr.common.util.StrUtils; import org.apache.solr.util.plugin.ResourceLoaderAware; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.StopAnalyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.CharArraySet; -import java.util.HashSet; -import java.util.List; -import java.io.File; +import java.util.Map; import java.util.Set; -import java.io.File; import java.io.IOException; /** @@ -37,6 +33,12 @@ import java.io.IOException; */ public class StopFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { + @Override + public void init(Map args) { + super.init(args); + assureMatchVersion(); + } + public void inform(ResourceLoader loader) { String stopWordFiles = args.get("words"); ignoreCase = getBoolean("ignoreCase",false); @@ -44,20 +46,12 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc if (stopWordFiles != null) { try { - List files = StrUtils.splitFileNames(stopWordFiles); - if (stopWords == null && files.size() > 0){ - //default stopwords list has 35 or so words, but maybe don't make it that big to start - stopWords = new CharArraySet(files.size() * 10, ignoreCase); - } - for (String file : files) { - List wlist = loader.getLines(file.trim()); - stopWords.addAll(StopFilter.makeStopSet(wlist, ignoreCase)); - } + stopWords = getWordSet(loader, stopWordFiles, ignoreCase); } catch (IOException e) { throw new RuntimeException(e); } } else { - stopWords = new CharArraySet(StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase); + stopWords = new CharArraySet(luceneMatchVersion, 
StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase); } } @@ -78,7 +72,6 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc } public StopFilter create(TokenStream input) { - assureMatchVersion(); StopFilter stopFilter = new StopFilter(luceneMatchVersion,input,stopWords,ignoreCase); stopFilter.setEnablePositionIncrements(enablePositionIncrements); return stopFilter; diff --git a/solr/src/java/org/apache/solr/analysis/SynonymFilter.java b/solr/src/java/org/apache/solr/analysis/SynonymFilter.java index a8be4496e8b..4a7db36db95 100644 --- a/solr/src/java/org/apache/solr/analysis/SynonymFilter.java +++ b/solr/src/java/org/apache/solr/analysis/SynonymFilter.java @@ -24,7 +24,6 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; -import org.apache.lucene.util.AttributeImpl; import org.apache.lucene.util.AttributeSource; import java.io.IOException; @@ -50,7 +49,7 @@ public class SynonymFilter extends TokenFilter { public SynonymFilter(TokenStream in, SynonymMap map) { super(in); this.map = map; - // just ensuring these exist attributes exist... + // just ensuring these attributes exist... addAttribute(TermAttribute.class); addAttribute(PositionIncrementAttribute.class); addAttribute(OffsetAttribute.class); @@ -88,7 +87,7 @@ public class SynonymFilter extends TokenFilter { // common case fast-path of first token not matching anything AttributeSource firstTok = nextTok(); if (firstTok == null) return false; - TermAttribute termAtt = (TermAttribute) firstTok.addAttribute(TermAttribute.class); + TermAttribute termAtt = firstTok.addAttribute(TermAttribute.class); SynonymMap result = map.submap!=null ? map.submap.get(termAtt.termBuffer(), 0, termAtt.termLength()) : null; if (result == null) { copy(this, firstTok); @@ -121,7 +120,7 @@ public class SynonymFilter extends TokenFilter { boolean includeOrig = result.includeOrig(); AttributeSource origTok = includeOrig ? firstTok : null; - PositionIncrementAttribute firstPosIncAtt = (PositionIncrementAttribute) firstTok.addAttribute(PositionIncrementAttribute.class); + PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class); int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream int repPos=0; // curr position in replacement token stream int pos=0; // current position in merged token stream @@ -129,12 +128,11 @@ public class SynonymFilter extends TokenFilter { for (int i=0; i1, should not match, if==0, check multiple at this level? 
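
StopFilterFactory above now resolves its word list once in inform() via getWordSet() and asserts luceneMatchVersion at init() rather than at create() time. A sketch of the factory-test idiom this patch uses elsewhere, applied here (stopwords.txt is a hypothetical resource name):

    // Inside a BaseTokenTestCase subclass, so DEFAULT_VERSION_PARAM and
    // DEFAULT_VERSION are in scope.
    ResourceLoader loader = new SolrResourceLoader(null, null);
    StopFilterFactory factory = new StopFilterFactory();
    Map args = new HashMap(DEFAULT_VERSION_PARAM);
    args.put("words", "stopwords.txt");
    args.put("ignoreCase", "true");
    factory.init(args);
    factory.inform(loader);
    TokenStream stream = factory.create(
        new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("to be or not")));
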
- TermAttribute termAtt = (TermAttribute) tok.getAttribute(TermAttribute.class); + TermAttribute termAtt = tok.getAttribute(TermAttribute.class); SynonymMap subMap = map.submap.get(termAtt.termBuffer(), 0, termAtt.termLength()); if (subMap != null) { @@ -243,12 +241,8 @@ public class SynonymFilter extends TokenFilter { } private void copy(AttributeSource target, AttributeSource source) { - if (target == source) - return; - for (Iterator sourceIt = source.getAttributeImplsIterator(), targetIt=target.getAttributeImplsIterator(); - sourceIt.hasNext();) { - sourceIt.next().copyTo(targetIt.next()); - } + if (target != source) + source.copyTo(target); } @Override diff --git a/solr/src/java/org/apache/solr/analysis/SynonymFilterFactory.java b/solr/src/java/org/apache/solr/analysis/SynonymFilterFactory.java index 7d1c18dda86..049143172a4 100644 --- a/solr/src/java/org/apache/solr/analysis/SynonymFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/SynonymFilterFactory.java @@ -17,7 +17,6 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.solr.common.ResourceLoader; @@ -136,7 +135,7 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso TokenStream ts = loadTokenizer(tokFactory, reader); List tokList = new ArrayList(); try { - TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class); + TermAttribute termAtt = ts.addAttribute(TermAttribute.class); while (ts.incrementToken()){ String text = new String(termAtt.termBuffer(), 0, termAtt.termLength()); if( text.length() > 0 ) diff --git a/solr/src/java/org/apache/solr/analysis/SynonymMap.java b/solr/src/java/org/apache/solr/analysis/SynonymMap.java index 2e8bec5052b..b72763378ad 100644 --- a/solr/src/java/org/apache/solr/analysis/SynonymMap.java +++ b/solr/src/java/org/apache/solr/analysis/SynonymMap.java @@ -17,8 +17,9 @@ package org.apache.solr.analysis; +import org.apache.lucene.analysis.CharArrayMap; import org.apache.lucene.analysis.Token; -import org.apache.solr.util.CharArrayMap; +import org.apache.lucene.util.Version; import java.util.*; @@ -52,7 +53,9 @@ public class SynonymMap { SynonymMap currMap = this; for (String str : singleMatch) { if (currMap.submap==null) { - currMap.submap = new CharArrayMap(1, ignoreCase()); + // for now hardcode at 2.9, as its what the old code did. + // would be nice to fix, but shouldn't store a version in each submap!!! + currMap.submap = new CharArrayMap(Version.LUCENE_29, 1, ignoreCase()); } SynonymMap map = currMap.submap.get(str); @@ -68,7 +71,7 @@ public class SynonymMap { if (currMap.synonyms != null && !mergeExisting) { throw new RuntimeException("SynonymFilter: there is already a mapping for " + singleMatch); } - List superset = currMap.synonyms==null ? replacement : + List superset = currMap.synonyms==null ? 
replacement : mergeTokens(Arrays.asList(currMap.synonyms), replacement); currMap.synonyms = (Token[])superset.toArray(new Token[superset.size()]); if (includeOrig) currMap.flags |= INCLUDE_ORIG; diff --git a/solr/src/java/org/apache/solr/analysis/ThaiWordFilterFactory.java b/solr/src/java/org/apache/solr/analysis/ThaiWordFilterFactory.java index 670f2dbde53..c6f8a837357 100644 --- a/solr/src/java/org/apache/solr/analysis/ThaiWordFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/ThaiWordFilterFactory.java @@ -18,15 +18,11 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.th.*; -import java.io.IOException; -import java.util.Locale; -import java.lang.Character.UnicodeBlock; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.th.ThaiWordFilter; + import org.apache.lucene.analysis.TokenStream; -import java.text.BreakIterator; -import java.util.Map; + +/** Factory for {@link ThaiWordFilter} */ public class ThaiWordFilterFactory extends BaseTokenFilterFactory { public ThaiWordFilter create(TokenStream input) { return new ThaiWordFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/TokenOffsetPayloadTokenFilterFactory.java b/solr/src/java/org/apache/solr/analysis/TokenOffsetPayloadTokenFilterFactory.java index 16fbd2fd0b5..729e33424b8 100644 --- a/solr/src/java/org/apache/solr/analysis/TokenOffsetPayloadTokenFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/TokenOffsetPayloadTokenFilterFactory.java @@ -18,13 +18,11 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.payloads.*; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenFilter; + +import org.apache.lucene.analysis.payloads.TokenOffsetPayloadTokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.index.Payload; -import java.io.IOException; -import java.util.Map; + +/** Factory for {@link TokenOffsetPayloadTokenFilter} */ public class TokenOffsetPayloadTokenFilterFactory extends BaseTokenFilterFactory { public TokenOffsetPayloadTokenFilter create(TokenStream input) { return new TokenOffsetPayloadTokenFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/TokenizerChain.java b/solr/src/java/org/apache/solr/analysis/TokenizerChain.java index 4dfbfc4ae77..3f92f256d80 100644 --- a/solr/src/java/org/apache/solr/analysis/TokenizerChain.java +++ b/solr/src/java/org/apache/solr/analysis/TokenizerChain.java @@ -23,7 +23,6 @@ import org.apache.lucene.analysis.CharReader; import org.apache.lucene.analysis.Tokenizer; import java.io.Reader; -import java.io.IOException; /** * @version $Id$ diff --git a/solr/src/java/org/apache/solr/analysis/TokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/TokenizerFactory.java index f9ef4152f7e..fe248566758 100644 --- a/solr/src/java/org/apache/solr/analysis/TokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/TokenizerFactory.java @@ -19,7 +19,6 @@ package org.apache.solr.analysis; import java.io.*; import java.util.Map; -import org.apache.solr.core.SolrConfig; import org.apache.lucene.analysis.*; diff --git a/solr/src/java/org/apache/solr/analysis/TrieTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/TrieTokenizerFactory.java index 24aaf0cff98..aec963c8bf2 100644 --- a/solr/src/java/org/apache/solr/analysis/TrieTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/TrieTokenizerFactory.java @@ -16,7 +16,6 @@ */ package 
org.apache.solr.analysis; -import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.NumericTokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.solr.common.SolrException; diff --git a/solr/src/java/org/apache/solr/analysis/TrimFilter.java b/solr/src/java/org/apache/solr/analysis/TrimFilter.java index fdad57daed6..821fc27f7b2 100644 --- a/solr/src/java/org/apache/solr/analysis/TrimFilter.java +++ b/solr/src/java/org/apache/solr/analysis/TrimFilter.java @@ -17,7 +17,6 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; @@ -41,8 +40,8 @@ public final class TrimFilter extends TokenFilter { super(in); this.updateOffsets = updateOffsets; - this.termAtt = (TermAttribute) addAttribute(TermAttribute.class); - this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + this.termAtt = addAttribute(TermAttribute.class); + this.offsetAtt = addAttribute(OffsetAttribute.class); } @Override diff --git a/solr/src/java/org/apache/solr/analysis/TurkishLowerCaseFilterFactory.java b/solr/src/java/org/apache/solr/analysis/TurkishLowerCaseFilterFactory.java index 6b0f38f9df5..44b70eaabcb 100644 --- a/solr/src/java/org/apache/solr/analysis/TurkishLowerCaseFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/TurkishLowerCaseFilterFactory.java @@ -20,7 +20,7 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; -/** Factory for TurkishLowerCaseFilter */ +/** Factory for {@link TurkishLowerCaseFilter} */ public class TurkishLowerCaseFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new TurkishLowerCaseFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/TypeAsPayloadTokenFilterFactory.java b/solr/src/java/org/apache/solr/analysis/TypeAsPayloadTokenFilterFactory.java index 23efa386983..8fa00caba6e 100644 --- a/solr/src/java/org/apache/solr/analysis/TypeAsPayloadTokenFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/TypeAsPayloadTokenFilterFactory.java @@ -18,13 +18,11 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.payloads.*; -import org.apache.lucene.analysis.TokenFilter; + +import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.index.Payload; -import java.io.IOException; -import java.util.Map; + +/** Factory for {@link TypeAsPayloadTokenFilter} */ public class TypeAsPayloadTokenFilterFactory extends BaseTokenFilterFactory { public TypeAsPayloadTokenFilter create(TokenStream input) { return new TypeAsPayloadTokenFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/WhitespaceTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/WhitespaceTokenizerFactory.java index 63bf3edda76..c752188fadc 100644 --- a/solr/src/java/org/apache/solr/analysis/WhitespaceTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/WhitespaceTokenizerFactory.java @@ -17,17 +17,22 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceTokenizer; import java.io.Reader; +import java.util.Map; /** * @version $Id$ */ public class WhitespaceTokenizerFactory extends 
BaseTokenizerFactory { - public WhitespaceTokenizer create(Reader input) { + @Override + public void init(Map args) { + super.init(args); assureMatchVersion(); + } + + public WhitespaceTokenizer create(Reader input) { return new WhitespaceTokenizer(luceneMatchVersion,input); } } diff --git a/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java b/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java index 8d35f1d6406..f5963f873bc 100644 --- a/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java +++ b/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java @@ -120,10 +120,10 @@ final class WordDelimiterFilter extends TokenFilter { */ final CharArraySet protWords; - private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class); - private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class); - private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); - private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class); + private final TermAttribute termAttribute = addAttribute(TermAttribute.class); + private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); + private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class); + private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class); // used for iterating word delimiter breaks private final WordDelimiterIterator iterator; diff --git a/solr/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java b/solr/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java index a67de6be17b..19dfc361e47 100644 --- a/solr/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java @@ -21,12 +21,8 @@ import org.apache.lucene.analysis.CharArraySet; import org.apache.solr.util.plugin.ResourceLoaderAware; import org.apache.solr.common.ResourceLoader; -import org.apache.solr.common.util.StrUtils; - import java.util.Map; -import java.io.File; -import java.util.List; import java.io.IOException; @@ -40,21 +36,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implement String wordFiles = args.get(PROTECTED_TOKENS); if (wordFiles != null) { try { - File protectedWordFiles = new File(wordFiles); - if (protectedWordFiles.exists()) { - List wlist = loader.getLines(wordFiles); - //This cast is safe in Lucene - protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally - } else { - List files = StrUtils.splitFileNames(wordFiles); - for (String file : files) { - List wlist = loader.getLines(file.trim()); - if (protectedWords == null) - protectedWords = new CharArraySet(wlist, false); - else - protectedWords.addAll(wlist); - } - } + protectedWords = getWordSet(loader, wordFiles, false); } catch (IOException e) { throw new RuntimeException(e); } diff --git a/solr/src/java/org/apache/solr/util/CharArrayMap.java b/solr/src/java/org/apache/solr/util/CharArrayMap.java deleted file mode 100755 index e2f9c714723..00000000000 --- a/solr/src/java/org/apache/solr/util/CharArrayMap.java +++ /dev/null @@ -1,411 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.util; - -import java.util.*; -import java.io.Serializable; - -/** - * A simple class that stores key Strings as char[]'s in a - * hash table. Note that this is not a general purpose - * class. For example, it cannot remove items from the - * map, nor does it resize its hash table to be smaller, - * etc. It is designed to be quick to retrieve items - * by char[] keys without the necessity of converting - * to a String first. - */ - -public class CharArrayMap extends AbstractMap - implements Map, Cloneable, Serializable -{ - private final static int INIT_SIZE = 2; - private char[][] keys; - private Object[] values; - private int count; - private final boolean ignoreCase; - - /** Create map with enough capacity to hold startSize - * terms */ - public CharArrayMap(int initialCapacity, boolean ignoreCase) { - this.ignoreCase = ignoreCase; - int size = INIT_SIZE; - // load factor of .75, inverse is 1.25, or x+x/4 - initialCapacity = initialCapacity + (initialCapacity >>2); - while(size <= initialCapacity) - size <<= 1; - keys = new char[size][]; - values = new Object[size]; - } - - public boolean ignoreCase() { - return ignoreCase; - } - - public V get(char[] key) { - return get(key, 0, key.length); - } - - public V get(char[] key, int off, int len) { - return (V)values[getSlot(key, off, len)]; - } - - public V get(CharSequence key) { - return (V)values[getSlot(key)]; - } - - @Override - public V get(Object key) { - return (V)values[getSlot(key)]; - } - - @Override - public boolean containsKey(Object s) { - return keys[getSlot(s)] != null; - } - - @Override - public boolean containsValue(Object value) { - if (value == null) { - // search for key with a null value - for (int i=0; i>8)+code)|1; - do { - code += inc; - pos = code & (keys.length-1); - key2 = keys[pos]; - } while (key2 != null && !equals(key, off, len, key2)); - } - return pos; - } - - /** Returns true if the String is in the set */ - private int getSlot(CharSequence key) { - int code = getHashCode(key); - int pos = code & (keys.length-1); - char[] key2 = keys[pos]; - if (key2 != null && !equals(key, key2)) { - final int inc = ((code>>8)+code)|1; - do { - code += inc; - pos = code & (keys.length-1); - key2 = keys[pos]; - } while (key2 != null && !equals(key, key2)); - } - return pos; - } - - public V put(CharSequence key, V val) { - return put(key.toString(), val); // could be more efficient - } - - @Override - public V put(String key, V val) { - return put(key.toCharArray(), val); - } - - /** Add this key,val pair to the map. - * The char[] key is directly used, no copy is made. - * If ignoreCase is true for this Map, the key array will be directly modified. - * The user should never modify the key after calling this method. 
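
Context for this deletion: Solr's private CharArrayMap is superseded by Lucene's version-aware org.apache.lucene.analysis.CharArrayMap, which the SynonymMap and StemmerOverrideFilterFactory hunks above already import. A sketch of the replacement API, mimicking one tab-separated "token<TAB>stem" line of a stemmer-override dictionary (the entry itself is made up):

    import org.apache.lucene.analysis.CharArrayMap;
    import org.apache.lucene.util.Version;

    public class StemDictSketch {
      public static void main(String[] args) {
        // raw type, matching the factory code above
        CharArrayMap dictionary = new CharArrayMap(Version.LUCENE_31, 10, false);
        dictionary.put("monkeys", "monkey");
        char[] term = "monkeys".toCharArray();
        // char[]-keyed lookup, as SynonymMap does with the term buffer
        System.out.println(dictionary.get(term, 0, term.length)); // monkey
      }
    }
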
- */ - public V put(char[] key, Object val) { - if (ignoreCase) - for(int i=0;i< key.length;i++) - key[i] = Character.toLowerCase(key[i]); - int slot = getSlot(key, 0, key.length); - if (keys[slot] == null) count++; - Object prev = values[slot]; - keys[slot] = key; - values[slot] = val; - - if (count + (count>>2) >= keys.length) { - rehash(); - } - - return (V)prev; - } - - - private boolean equals(char[] text1, int off, int len, char[] text2) { - if (len != text2.length) - return false; - if (ignoreCase) { - for(int i=0;i> entrySet() { - return new EntrySet(); - } - - /** Returns an EntryIterator over this Map. */ - public EntryIterator iterator() { - return new EntryIterator(); - } - - /** public iterator class so efficient methods are exposed to users */ - public class EntryIterator implements Iterator> { - int pos=-1; - int lastPos; - - EntryIterator() { - goNext(); - } - - private void goNext() { - lastPos = pos; - pos++; - while (pos < keys.length && keys[pos] == null) pos++; - } - - public boolean hasNext() { - return pos < keys.length; - } - - /** gets the next key... do not modify the returned char[] */ - public char[] nextKey() { - goNext(); - return keys[lastPos]; - } - - /** gets the next key as a newly created String object */ - public String nextKeyString() { - return new String(nextKey()); - } - - /** returns the value associated with the last key returned */ - public V currentValue() { - return (V)values[lastPos]; - } - - /** sets the value associated with the last key returned */ - public V setValue(V value) { - V old = (V)values[lastPos]; - values[lastPos] = value; - return old; - } - - /** Returns an Entry object created on the fly... - * use nextCharArray() + currentValie() for better efficiency. */ - public Map.Entry next() { - goNext(); - return new MapEntry(lastPos); - } - - public void remove() { - throw new UnsupportedOperationException(); - } - } - - - private class MapEntry implements Map.Entry { - final int pos; - - MapEntry(int pos) { - this.pos = pos; - } - - public char[] getCharArr() { - return keys[pos]; - } - - public String getKey() { - return new String(getCharArr()); - } - - public V getValue() { - return (V)values[pos]; - } - - public V setValue(V value) { - V old = (V)values[pos]; - values[pos] = value; - return old; - } - - public String toString() { - return getKey() + '=' + getValue(); - } - } - - - - private class EntrySet extends AbstractSet> { - public EntryIterator iterator() { - return new EntryIterator(); - } - public boolean contains(Object o) { - if (!(o instanceof Map.Entry)) - return false; - Map.Entry e = (Map.Entry)o; - Object key = e.getKey(); - if (key==null) return false; // we don't support null keys - Object val = e.getValue(); - Object v = get(key); - return v==null ? 
val==null : v.equals(val); - } - public boolean remove(Object o) { - throw new UnsupportedOperationException(); - } - public int size() { - return count; - } - public void clear() { - CharArrayMap.this.clear(); - } - } - - @Override - public Object clone() { - CharArrayMap map = null; - try { - map = (CharArrayMap)super.clone(); - map.keys = keys.clone(); - map.values = values.clone(); - } catch (CloneNotSupportedException e) { - // impossible - } - return map; - } -} diff --git a/solr/src/test/org/apache/solr/analysis/BaseTokenTestCase.java b/solr/src/test/org/apache/solr/analysis/BaseTokenTestCase.java index 5bbe6b8dc2e..ce3338d0b8e 100644 --- a/solr/src/test/org/apache/solr/analysis/BaseTokenTestCase.java +++ b/solr/src/test/org/apache/solr/analysis/BaseTokenTestCase.java @@ -21,13 +21,18 @@ import java.util.Collections; import java.util.Map; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.util.Version; +import org.apache.solr.core.Config; /** * General token testing helper functions */ public abstract class BaseTokenTestCase extends BaseTokenStreamTestCase { + /** a map containing the default test version param for easy testing */ protected static final Map DEFAULT_VERSION_PARAM = Collections.singletonMap("luceneMatchVersion", System.getProperty("tests.luceneMatchVersion", "LUCENE_CURRENT")); + /** The default test version for easy testing */ + public static final Version DEFAULT_VERSION = Config.parseLuceneVersionString(DEFAULT_VERSION_PARAM.get("luceneMatchVersion")); } diff --git a/solr/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java b/solr/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java index acd5e418f3c..6da1129b648 100644 --- a/solr/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java +++ b/solr/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java @@ -39,12 +39,12 @@ public class CommonGramsFilterFactoryTest extends BaseTokenTestCase { ResourceLoader loader = new SolrResourceLoader(null, null); assertTrue("loader is null and it shouldn't be", loader != null); CommonGramsFilterFactory factory = new CommonGramsFilterFactory(); - Map args = new HashMap(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); args.put("words", "stop-1.txt"); args.put("ignoreCase", "true"); factory.init(args); factory.inform(loader); - Set words = factory.getCommonWords(); + Set words = factory.getCommonWords(); assertTrue("words is null and it shouldn't be", words != null); assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2); @@ -71,13 +71,13 @@ public class CommonGramsFilterFactoryTest extends BaseTokenTestCase { ResourceLoader loader = new SolrResourceLoader(null, null); assertTrue("loader is null and it shouldn't be", loader != null); CommonGramsFilterFactory factory = new CommonGramsFilterFactory(); - Map args = new HashMap(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); factory.init(args); factory.inform(loader); - Set words = factory.getCommonWords(); + Set words = factory.getCommonWords(); assertTrue("words is null and it shouldn't be", words != null); assertTrue(words.contains("the")); - Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("testing the factory")); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("testing the factory")); TokenStream stream = factory.create(tokenizer); assertTokenStreamContents(stream, new String[] { "testing", "testing_the", "the", "the_factory", "factory" }); diff --git 
a/solr/src/test/org/apache/solr/analysis/CommonGramsFilterTest.java b/solr/src/test/org/apache/solr/analysis/CommonGramsFilterTest.java index 69d95ab2ea2..0b0f4067e14 100644 --- a/solr/src/test/org/apache/solr/analysis/CommonGramsFilterTest.java +++ b/solr/src/test/org/apache/solr/analysis/CommonGramsFilterTest.java @@ -35,10 +35,10 @@ public class CommonGramsFilterTest extends BaseTokenTestCase { public void testReset() throws Exception { final String input = "How the s a brown s cow d like A B thing?"; - WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input)); + WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)); CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords); - TermAttribute term = (TermAttribute) cgf.addAttribute(TermAttribute.class); + TermAttribute term = cgf.addAttribute(TermAttribute.class); assertTrue(cgf.incrementToken()); assertEquals("How", term.term()); assertTrue(cgf.incrementToken()); @@ -56,11 +56,11 @@ public class CommonGramsFilterTest extends BaseTokenTestCase { public void testQueryReset() throws Exception { final String input = "How the s a brown s cow d like A B thing?"; - WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input)); + WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)); CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords); CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf); - TermAttribute term = (TermAttribute) wt.addAttribute(TermAttribute.class); + TermAttribute term = wt.addAttribute(TermAttribute.class); assertTrue(nsf.incrementToken()); assertEquals("How_the", term.term()); assertTrue(nsf.incrementToken()); @@ -88,7 +88,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase { @Override public TokenStream tokenStream(String field, Reader in) { return new CommonGramsQueryFilter(new CommonGramsFilter( - new WhitespaceTokenizer(in), commonWords)); + new WhitespaceTokenizer(DEFAULT_VERSION, in), commonWords)); } }; @@ -157,7 +157,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase { @Override public TokenStream tokenStream(String field, Reader in) { return new CommonGramsFilter( - new WhitespaceTokenizer(in), commonWords); + new WhitespaceTokenizer(DEFAULT_VERSION, in), commonWords); } }; @@ -243,7 +243,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase { */ public void testCaseSensitive() throws Exception { final String input = "How The s a brown s cow d like A B thing?"; - WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input)); + WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)); Set common = CommonGramsFilter.makeCommonSet(commonWords); TokenFilter cgf = new CommonGramsFilter(wt, common, false); assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s", @@ -256,7 +256,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase { */ public void testLastWordisStopWord() throws Exception { final String input = "dog the"; - WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input)); + WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)); CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords); TokenFilter nsf = new CommonGramsQueryFilter(cgf); assertTokenStreamContents(nsf, new String[] { "dog_the" }); @@ -267,7 +267,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase { */ public void testFirstWordisStopWord() throws 
Exception { final String input = "the dog"; - WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input)); + WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)); CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords); TokenFilter nsf = new CommonGramsQueryFilter(cgf); assertTokenStreamContents(nsf, new String[] { "the_dog" }); @@ -278,7 +278,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase { */ public void testOneWordQueryStopWord() throws Exception { final String input = "the"; - WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input)); + WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)); CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords); TokenFilter nsf = new CommonGramsQueryFilter(cgf); assertTokenStreamContents(nsf, new String[] { "the" }); @@ -289,7 +289,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase { */ public void testOneWordQuery() throws Exception { final String input = "monster"; - WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input)); + WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)); CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords); TokenFilter nsf = new CommonGramsQueryFilter(cgf); assertTokenStreamContents(nsf, new String[] { "monster" }); @@ -300,7 +300,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase { */ public void TestFirstAndLastStopWord() throws Exception { final String input = "the of"; - WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input)); + WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)); CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords); TokenFilter nsf = new CommonGramsQueryFilter(cgf); assertTokenStreamContents(nsf, new String[] { "the_of" }); diff --git a/solr/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java b/solr/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java index 5e29e41e49a..bf093005855 100644 --- a/solr/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java +++ b/solr/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java @@ -38,12 +38,12 @@ public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase { ResourceLoader loader = new SolrResourceLoader(null, null); assertTrue("loader is null and it shouldn't be", loader != null); CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory(); - Map args = new HashMap(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); args.put("words", "stop-1.txt"); args.put("ignoreCase", "true"); factory.init(args); factory.inform(loader); - Set words = factory.getCommonWords(); + Set words = factory.getCommonWords(); assertTrue("words is null and it shouldn't be", words != null); assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2); @@ -70,13 +70,13 @@ public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase { ResourceLoader loader = new SolrResourceLoader(null, null); assertTrue("loader is null and it shouldn't be", loader != null); CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory(); - Map args = new HashMap(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); factory.init(args); factory.inform(loader); - Set words = factory.getCommonWords(); + Set words = factory.getCommonWords(); assertTrue("words is null and it 
shouldn't be", words != null); assertTrue(words.contains("the")); - Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("testing the factory")); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("testing the factory")); TokenStream stream = factory.create(tokenizer); assertTokenStreamContents(stream, new String[] { "testing_the", "the_factory" }); diff --git a/solr/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java b/solr/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java index 35f491fee7a..87d7098c2f0 100644 --- a/solr/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java +++ b/solr/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java @@ -29,7 +29,7 @@ public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase { public void testDefaults() throws Exception { DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory(); factory.init(new HashMap()); - TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international")); + TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international")); TokenStream filteredStream = factory.create(inputStream); assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass()); @@ -43,7 +43,7 @@ public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase { parameters.put("maxCodeLength", "8"); factory.init(parameters); - TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international")); + TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international")); TokenStream filteredStream = factory.create(inputStream); assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass()); @@ -56,10 +56,10 @@ public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase { public void testReset() throws Exception { DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory(); factory.init(new HashMap()); - TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international")); + TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international")); TokenStream filteredStream = factory.create(inputStream); - TermAttribute termAtt = (TermAttribute) filteredStream.addAttribute(TermAttribute.class); + TermAttribute termAtt = filteredStream.addAttribute(TermAttribute.class); assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass()); assertTrue(filteredStream.incrementToken()); diff --git a/solr/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java b/solr/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java index cef1ac3d656..f05d24a37ac 100644 --- a/solr/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java +++ b/solr/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java @@ -24,42 +24,42 @@ import org.apache.lucene.analysis.WhitespaceTokenizer; public class DoubleMetaphoneFilterTest extends BaseTokenTestCase { public void testSize4FalseInject() throws Exception { - TokenStream stream = new WhitespaceTokenizer(new StringReader("international")); + TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international")); TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false); assertTokenStreamContents(filter, new String[] { "ANTR" }); } public void testSize4TrueInject() throws Exception { - TokenStream stream = new WhitespaceTokenizer(new 
StringReader("international")); + TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international")); TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true); assertTokenStreamContents(filter, new String[] { "international", "ANTR" }); } public void testAlternateInjectFalse() throws Exception { - TokenStream stream = new WhitespaceTokenizer(new StringReader("Kuczewski")); + TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Kuczewski")); TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false); assertTokenStreamContents(filter, new String[] { "KSSK", "KXFS" }); } public void testSize8FalseInject() throws Exception { - TokenStream stream = new WhitespaceTokenizer(new StringReader("international")); + TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international")); TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false); assertTokenStreamContents(filter, new String[] { "ANTRNXNL" }); } public void testNonConvertableStringsWithInject() throws Exception { - TokenStream stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%&")); + TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&")); TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true); assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" }); } public void testNonConvertableStringsWithoutInject() throws Exception { - TokenStream stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%&")); + TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&")); TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false); assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" }); // should have something after the stream - stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%& hello")); + stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%& hello")); filter = new DoubleMetaphoneFilter(stream, 8, false); assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" }); } diff --git a/solr/src/test/org/apache/solr/analysis/EnglishPorterFilterFactoryTest.java b/solr/src/test/org/apache/solr/analysis/EnglishPorterFilterFactoryTest.java index 8f1c22dd0da..ac37331d3ba 100644 --- a/solr/src/test/org/apache/solr/analysis/EnglishPorterFilterFactoryTest.java +++ b/solr/src/test/org/apache/solr/analysis/EnglishPorterFilterFactoryTest.java @@ -46,11 +46,11 @@ public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase { } EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory(); - Map args = new HashMap(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); factory.init(args); factory.inform(new LinesMockSolrResourceLoader(new ArrayList())); - Tokenizer tokenizer = new WhitespaceTokenizer( + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(StrUtils.join(Arrays.asList(test), ' '))); TokenStream stream = factory.create(tokenizer); assertTokenStreamContents(stream, gold); @@ -71,13 +71,13 @@ public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase { } EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory(); - Map args = new HashMap(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); args.put(EnglishPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt"); factory.init(args); List lines = new ArrayList(); Collections.addAll(lines, "banks", "fledgling"); factory.inform(new 
LinesMockSolrResourceLoader(lines)); - Tokenizer tokenizer = new WhitespaceTokenizer( + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(StrUtils.join(Arrays.asList(test), ' '))); TokenStream stream = factory.create(tokenizer); assertTokenStreamContents(stream, gold); diff --git a/solr/src/test/org/apache/solr/analysis/LengthFilterTest.java b/solr/src/test/org/apache/solr/analysis/LengthFilterTest.java index 9b642dacc34..300e4b43388 100644 --- a/solr/src/test/org/apache/solr/analysis/LengthFilterTest.java +++ b/solr/src/test/org/apache/solr/analysis/LengthFilterTest.java @@ -33,7 +33,7 @@ public class LengthFilterTest extends BaseTokenTestCase { args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10)); factory.init(args); String test = "foo foobar super-duper-trooper"; - TokenStream stream = factory.create(new WhitespaceTokenizer(new StringReader(test))); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(test))); assertTokenStreamContents(stream, new String[] { "foobar" }); } } \ No newline at end of file diff --git a/solr/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java b/solr/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java index bc5999b459d..e786c0017c2 100644 --- a/solr/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java +++ b/solr/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java @@ -48,12 +48,12 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase { } SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory(); - Map args = new HashMap(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); args.put("language", "English"); factory.init(args); factory.inform(new LinesMockSolrResourceLoader(new ArrayList())); - Tokenizer tokenizer = new WhitespaceTokenizer( + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(StrUtils.join(Arrays.asList(test), ' '))); TokenStream stream = factory.create(tokenizer); assertTokenStreamContents(stream, gold); @@ -78,13 +78,13 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase { } EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory(); - Map args = new HashMap(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); args.put(SnowballPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt"); factory.init(args); List lines = new ArrayList(); Collections.addAll(lines, "banks", "fledgling"); factory.inform(new LinesMockSolrResourceLoader(lines)); - Tokenizer tokenizer = new WhitespaceTokenizer( + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(StrUtils.join(Arrays.asList(test), ' '))); TokenStream stream = factory.create(tokenizer); assertTokenStreamContents(stream, gold); @@ -116,13 +116,13 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase { public void testProtected() throws Exception { SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory(); ResourceLoader loader = new SolrResourceLoader(null, null); - Map args = new HashMap(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); args.put("protected", "protwords.txt"); args.put("language", "English"); factory.init(args); factory.inform(loader); Reader reader = new StringReader("ridding of some stemming"); - Tokenizer tokenizer = new WhitespaceTokenizer(reader); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); TokenStream stream = factory.create(tokenizer); 
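All of the protected-words tests above follow the same factory lifecycle: init(args) parses the configuration, inform(loader) reads the protected-words file, and create(tokenizer) wraps the stream. The following is a minimal standalone sketch of that lifecycle, not part of the patch: the class name is hypothetical, Version.LUCENE_30 stands in for the tests' DEFAULT_VERSION, and it assumes a Solr conf/ directory whose protwords.txt lists "ridding".

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;
import org.apache.solr.analysis.SnowballPorterFilterFactory;
import org.apache.solr.core.SolrResourceLoader;

public class SnowballProtectedWordsSketch {
  public static void main(String[] argv) throws Exception {
    Map<String, String> args = new HashMap<String, String>();
    args.put("luceneMatchVersion", "LUCENE_30"); // assumed value; the tests pass DEFAULT_VERSION_PARAM
    args.put("language", "English");
    args.put("protected", "protwords.txt");
    SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
    factory.init(args);                                 // parse configuration
    factory.inform(new SolrResourceLoader(null, null)); // resolve and load protwords.txt
    TokenStream ts = factory.create(new WhitespaceTokenizer(Version.LUCENE_30,
        new StringReader("ridding of some stemming")));
    // protected words keep their surface form ("ridding");
    // everything else is stemmed ("stemming" -> "stem").
  }
}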
assertTokenStreamContents(stream, new String[] { "ridding", "of", "some", "stem" }); } diff --git a/solr/src/test/org/apache/solr/analysis/TestBrazilianStemFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestBrazilianStemFilterFactory.java index d8e26014eb1..bba889336cb 100644 --- a/solr/src/test/org/apache/solr/analysis/TestBrazilianStemFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestBrazilianStemFilterFactory.java @@ -33,7 +33,7 @@ public class TestBrazilianStemFilterFactory extends BaseTokenTestCase { */ public void testStemming() throws Exception { Reader reader = new StringReader("Brasília"); - Tokenizer tokenizer = new WhitespaceTokenizer(reader); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); BrazilianStemFilterFactory factory = new BrazilianStemFilterFactory(); TokenStream stream = factory.create(tokenizer); assertTokenStreamContents(stream, new String[] { "brasil" }); diff --git a/solr/src/test/org/apache/solr/analysis/TestBufferedTokenStream.java b/solr/src/test/org/apache/solr/analysis/TestBufferedTokenStream.java index 2b2a35b788f..a69bb785643 100644 --- a/solr/src/test/org/apache/solr/analysis/TestBufferedTokenStream.java +++ b/solr/src/test/org/apache/solr/analysis/TestBufferedTokenStream.java @@ -59,7 +59,7 @@ public class TestBufferedTokenStream extends BaseTokenTestCase { final String input = "How now A B brown A cow B like A B thing?"; final String expected = "How now Q B brown A cow B like Q B thing?"; TokenStream ts = new AB_Q_Stream - (new WhitespaceTokenizer(new StringReader(input))); + (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input))); assertTokenStreamContents(ts, expected.split("\\s")); } @@ -67,15 +67,15 @@ public class TestBufferedTokenStream extends BaseTokenTestCase { final String input = "How now A B brown A cow B like A B thing?"; final String expected = "How now A A B brown A cow B like A A B thing?"; TokenStream ts = new AB_AAB_Stream - (new WhitespaceTokenizer(new StringReader(input))); + (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input))); assertTokenStreamContents(ts, expected.split("\\s")); } public void testReset() throws Exception { final String input = "How now A B brown A cow B like A B thing?"; - Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(input)); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)); TokenStream ts = new AB_AAB_Stream(tokenizer); - TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class); + TermAttribute term = ts.addAttribute(TermAttribute.class); assertTrue(ts.incrementToken()); assertEquals("How", term.term()); assertTrue(ts.incrementToken()); diff --git a/solr/src/test/org/apache/solr/analysis/TestBulgarianStemFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestBulgarianStemFilterFactory.java index d5fb5ff454d..1a9a5585614 100644 --- a/solr/src/test/org/apache/solr/analysis/TestBulgarianStemFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestBulgarianStemFilterFactory.java @@ -33,7 +33,7 @@ public class TestBulgarianStemFilterFactory extends BaseTokenTestCase { */ public void testStemming() throws Exception { Reader reader = new StringReader("компютри"); - Tokenizer tokenizer = new WhitespaceTokenizer(reader); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); BulgarianStemFilterFactory factory = new BulgarianStemFilterFactory(); TokenStream stream = factory.create(tokenizer); assertTokenStreamContents(stream, new 
String[] { "компютр" }); diff --git a/solr/src/test/org/apache/solr/analysis/TestCapitalizationFilter.java b/solr/src/test/org/apache/solr/analysis/TestCapitalizationFilter.java index 20992f2568a..70f24f93b06 100644 --- a/solr/src/test/org/apache/solr/analysis/TestCapitalizationFilter.java +++ b/solr/src/test/org/apache/solr/analysis/TestCapitalizationFilter.java @@ -34,7 +34,7 @@ public class TestCapitalizationFilter extends BaseTokenTestCase { public void testCapitalization() throws Exception { - Map args = new HashMap(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); args.put( CapitalizationFilterFactory.KEEP, "and the it BIG" ); args.put( CapitalizationFilterFactory.ONLY_FIRST_WORD, "true" ); @@ -74,18 +74,18 @@ public class TestCapitalizationFilter extends BaseTokenTestCase { // now each token factory.onlyFirstWord = false; - tokenizer = new WhitespaceTokenizer(new StringReader("Hello thEre my Name is Ryan")); + tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan")); stream = factory.create(tokenizer); assertTokenStreamContents(stream, new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" }); // now only the long words factory.minWordLength = 3; - tokenizer = new WhitespaceTokenizer(new StringReader("Hello thEre my Name is Ryan" )); + tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan" )); stream = factory.create(tokenizer); assertTokenStreamContents(stream, new String[] { "Hello", "There", "my", "Name", "is", "Ryan" }); // without prefix - tokenizer = new WhitespaceTokenizer(new StringReader("McKinley" )); + tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" )); stream = factory.create(tokenizer); assertTokenStreamContents(stream, new String[] { "Mckinley" }); @@ -93,14 +93,14 @@ public class TestCapitalizationFilter extends BaseTokenTestCase { factory = new CapitalizationFilterFactory(); args.put( "okPrefix", "McK" ); // all words factory.init( args ); - tokenizer = new WhitespaceTokenizer(new StringReader("McKinley" )); + tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" )); stream = factory.create(tokenizer); assertTokenStreamContents(stream, new String[] { "McKinley" }); // now try some stuff with numbers factory.forceFirstLetter = false; factory.onlyFirstWord = false; - tokenizer = new WhitespaceTokenizer(new StringReader("1st 2nd third" )); + tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("1st 2nd third" )); stream = factory.create(tokenizer); assertTokenStreamContents(stream, new String[] { "1st", "2nd", "Third" }); @@ -111,7 +111,7 @@ public class TestCapitalizationFilter extends BaseTokenTestCase { } public void testKeepIgnoreCase() throws Exception { - Map args = new HashMap(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); args.put( CapitalizationFilterFactory.KEEP, "kitten" ); args.put( CapitalizationFilterFactory.KEEP_IGNORE_CASE, "true" ); args.put( CapitalizationFilterFactory.ONLY_FIRST_WORD, "true" ); @@ -141,12 +141,12 @@ public class TestCapitalizationFilter extends BaseTokenTestCase { * This is very weird when combined with ONLY_FIRST_WORD!!! 
*/ public void testMinWordLength() throws Exception { - Map args = new HashMap(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); args.put(CapitalizationFilterFactory.ONLY_FIRST_WORD, "true"); args.put(CapitalizationFilterFactory.MIN_WORD_LENGTH, "5"); CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); factory.init(args); - Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader( + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader( "helo testing")); TokenStream ts = factory.create(tokenizer); assertTokenStreamContents(ts, new String[] {"helo", "Testing"}); @@ -157,11 +157,11 @@ public class TestCapitalizationFilter extends BaseTokenTestCase { * in each token (it should do nothing) */ public void testMaxWordCount() throws Exception { - Map args = new HashMap(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2"); CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); factory.init(args); - Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader( + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader( "one two three four")); TokenStream ts = factory.create(tokenizer); assertTokenStreamContents(ts, new String[] {"One", "Two", "Three", "Four"}); @@ -171,7 +171,7 @@ public class TestCapitalizationFilter extends BaseTokenTestCase { * Test CapitalizationFilterFactory's maxWordCount option when exceeded */ public void testMaxWordCount2() throws Exception { - Map args = new HashMap(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2"); CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); factory.init(args); @@ -187,11 +187,11 @@ public class TestCapitalizationFilter extends BaseTokenTestCase { * This is weird, it is not really a max, but inclusive (look at 'is') */ public void testMaxTokenLength() throws Exception { - Map args = new HashMap(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); args.put(CapitalizationFilterFactory.MAX_TOKEN_LENGTH, "2"); CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); factory.init(args); - Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader( + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader( "this is a test")); TokenStream ts = factory.create(tokenizer); assertTokenStreamContents(ts, new String[] {"this", "is", "A", "test"}); @@ -201,12 +201,12 @@ public class TestCapitalizationFilter extends BaseTokenTestCase { * Test CapitalizationFilterFactory's forceFirstLetter option */ public void testForceFirstLetter() throws Exception { - Map args = new HashMap(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); args.put(CapitalizationFilterFactory.KEEP, "kitten"); args.put(CapitalizationFilterFactory.FORCE_FIRST_LETTER, "true"); CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); factory.init(args); - Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("kitten")); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("kitten")); TokenStream ts = factory.create(tokenizer); assertTokenStreamContents(ts, new String[] {"Kitten"}); } diff --git a/solr/src/test/org/apache/solr/analysis/TestChineseFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestChineseFilterFactory.java index 83a931e758b..8c0d9e39526 100644 --- a/solr/src/test/org/apache/solr/analysis/TestChineseFilterFactory.java +++ 
b/solr/src/test/org/apache/solr/analysis/TestChineseFilterFactory.java @@ -33,7 +33,7 @@ public class TestChineseFilterFactory extends BaseTokenTestCase { */ public void testFiltering() throws Exception { Reader reader = new StringReader("this 1234 Is such a silly filter"); - Tokenizer tokenizer = new WhitespaceTokenizer(reader); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); ChineseFilterFactory factory = new ChineseFilterFactory(); TokenStream stream = factory.create(tokenizer); assertTokenStreamContents(stream, new String[] { "Is", "silly", "filter" }); diff --git a/solr/src/test/org/apache/solr/analysis/TestCollationKeyFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestCollationKeyFilterFactory.java index 3ec30dac61c..69b7342a39b 100644 --- a/solr/src/test/org/apache/solr/analysis/TestCollationKeyFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestCollationKeyFilterFactory.java @@ -177,9 +177,9 @@ public class TestCollationKeyFilterFactory extends BaseTokenTestCase { private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException { - TermAttribute term1 = (TermAttribute) stream1 + TermAttribute term1 = stream1 .addAttribute(TermAttribute.class); - TermAttribute term2 = (TermAttribute) stream2 + TermAttribute term2 = stream2 .addAttribute(TermAttribute.class); assertTrue(stream1.incrementToken()); assertTrue(stream2.incrementToken()); diff --git a/solr/src/test/org/apache/solr/analysis/TestCzechStemFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestCzechStemFilterFactory.java index dfeb7c12ebf..5f941948e2b 100644 --- a/solr/src/test/org/apache/solr/analysis/TestCzechStemFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestCzechStemFilterFactory.java @@ -33,7 +33,7 @@ public class TestCzechStemFilterFactory extends BaseTokenTestCase { */ public void testStemming() throws Exception { Reader reader = new StringReader("angličtí"); - Tokenizer tokenizer = new WhitespaceTokenizer(reader); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); CzechStemFilterFactory factory = new CzechStemFilterFactory(); TokenStream stream = factory.create(tokenizer); assertTokenStreamContents(stream, new String[] { "anglick" }); diff --git a/solr/src/test/org/apache/solr/analysis/TestDelimitedPayloadTokenFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestDelimitedPayloadTokenFilterFactory.java index f4e8aaee8e9..bc7f4b97a7f 100644 --- a/solr/src/test/org/apache/solr/analysis/TestDelimitedPayloadTokenFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestDelimitedPayloadTokenFilterFactory.java @@ -21,8 +21,6 @@ import java.io.StringReader; import java.util.HashMap; import java.util.Map; -import junit.framework.TestCase; - import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceTokenizer; import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter; @@ -32,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.solr.common.ResourceLoader; import org.apache.solr.core.SolrResourceLoader; -public class TestDelimitedPayloadTokenFilterFactory extends TestCase { +public class TestDelimitedPayloadTokenFilterFactory extends BaseTokenTestCase { public void testEncoder() throws Exception { Map args = new HashMap(); @@ -42,10 +40,10 @@ public class TestDelimitedPayloadTokenFilterFactory extends TestCase { ResourceLoader loader = new SolrResourceLoader(null, null); 
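The delimited-payload assertions in this file reduce to one idea: each incoming token of the form term|weight is split on the delimiter and the weight is encoded into the token's payload. Below is a self-contained sketch of that round trip, not part of the patch; the class name and the sample weights are made up, and Version.LUCENE_30 stands in for DEFAULT_VERSION.

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
import org.apache.lucene.analysis.payloads.FloatEncoder;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class DelimitedPayloadSketch {
  public static void main(String[] argv) throws Exception {
    TokenStream in = new WhitespaceTokenizer(Version.LUCENE_30,
        new StringReader("quick|2.0 red|0.5"));
    // split each token on '|' and encode the trailing float into the payload
    DelimitedPayloadTokenFilter f =
        new DelimitedPayloadTokenFilter(in, '|', new FloatEncoder());
    TermAttribute term = f.addAttribute(TermAttribute.class);
    PayloadAttribute pay = f.addAttribute(PayloadAttribute.class);
    while (f.incrementToken()) {
      float weight = PayloadHelper.decodeFloat(pay.getPayload().getData());
      System.out.println(term.term() + " -> " + weight); // quick -> 2.0, red -> 0.5
    }
  }
}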
factory.inform(loader); - TokenStream input = new WhitespaceTokenizer(new StringReader("the|0.1 quick|0.1 red|0.1")); + TokenStream input = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("the|0.1 quick|0.1 red|0.1")); DelimitedPayloadTokenFilter tf = factory.create(input); while (tf.incrementToken()){ - PayloadAttribute payAttr = (PayloadAttribute) tf.getAttribute(PayloadAttribute.class); + PayloadAttribute payAttr = tf.getAttribute(PayloadAttribute.class); assertTrue("payAttr is null and it shouldn't be", payAttr != null); byte[] payData = payAttr.getPayload().getData(); assertTrue("payData is null and it shouldn't be", payData != null); @@ -64,10 +62,10 @@ public class TestDelimitedPayloadTokenFilterFactory extends TestCase { ResourceLoader loader = new SolrResourceLoader(null, null); factory.inform(loader); - TokenStream input = new WhitespaceTokenizer(new StringReader("the*0.1 quick*0.1 red*0.1")); + TokenStream input = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("the*0.1 quick*0.1 red*0.1")); DelimitedPayloadTokenFilter tf = factory.create(input); while (tf.incrementToken()){ - PayloadAttribute payAttr = (PayloadAttribute) tf.getAttribute(PayloadAttribute.class); + PayloadAttribute payAttr = tf.getAttribute(PayloadAttribute.class); assertTrue("payAttr is null and it shouldn't be", payAttr != null); byte[] payData = payAttr.getPayload().getData(); assertTrue("payData is null and it shouldn't be", payData != null); diff --git a/solr/src/test/org/apache/solr/analysis/TestDictionaryCompoundWordTokenFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestDictionaryCompoundWordTokenFilterFactory.java index 55623037b1f..85fbc5c9f26 100644 --- a/solr/src/test/org/apache/solr/analysis/TestDictionaryCompoundWordTokenFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestDictionaryCompoundWordTokenFilterFactory.java @@ -37,10 +37,10 @@ public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenTestC */ public void testDecompounding() throws Exception { Reader reader = new StringReader("I like to play softball"); - Tokenizer tokenizer = new WhitespaceTokenizer(reader); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); DictionaryCompoundWordTokenFilterFactory factory = new DictionaryCompoundWordTokenFilterFactory(); ResourceLoader loader = new SolrResourceLoader(null, null); - Map args = new HashMap(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); args.put("dictionary", "compoundDictionary.txt"); factory.init(args); factory.inform(loader); diff --git a/solr/src/test/org/apache/solr/analysis/TestDutchStemFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestDutchStemFilterFactory.java index 41e20e22618..aee235c523c 100644 --- a/solr/src/test/org/apache/solr/analysis/TestDutchStemFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestDutchStemFilterFactory.java @@ -33,7 +33,7 @@ public class TestDutchStemFilterFactory extends BaseTokenTestCase { */ public void testStemming() throws Exception { Reader reader = new StringReader("lichamelijkheden"); - Tokenizer tokenizer = new WhitespaceTokenizer(reader); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); DutchStemFilterFactory factory = new DutchStemFilterFactory(); TokenStream stream = factory.create(tokenizer); assertTokenStreamContents(stream, new String[] { "licham" }); diff --git a/solr/src/test/org/apache/solr/analysis/TestElisionFilterFactory.java 
b/solr/src/test/org/apache/solr/analysis/TestElisionFilterFactory.java index f542a6cb135..feb374908d9 100644 --- a/solr/src/test/org/apache/solr/analysis/TestElisionFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestElisionFilterFactory.java @@ -37,7 +37,7 @@ public class TestElisionFilterFactory extends BaseTokenTestCase { */ public void testElision() throws Exception { Reader reader = new StringReader("l'avion"); - Tokenizer tokenizer = new WhitespaceTokenizer(reader); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); ElisionFilterFactory factory = new ElisionFilterFactory(); factory.init(DEFAULT_VERSION_PARAM); ResourceLoader loader = new SolrResourceLoader(null, null); diff --git a/solr/src/test/org/apache/solr/analysis/TestFrenchStemFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestFrenchStemFilterFactory.java index aa7ede14e40..7a32060da08 100644 --- a/solr/src/test/org/apache/solr/analysis/TestFrenchStemFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestFrenchStemFilterFactory.java @@ -33,7 +33,7 @@ public class TestFrenchStemFilterFactory extends BaseTokenTestCase { */ public void testStemming() throws Exception { Reader reader = new StringReader("habitable"); - Tokenizer tokenizer = new WhitespaceTokenizer(reader); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); FrenchStemFilterFactory factory = new FrenchStemFilterFactory(); TokenStream stream = factory.create(tokenizer); assertTokenStreamContents(stream, new String[] { "habit" }); diff --git a/solr/src/test/org/apache/solr/analysis/TestGermanStemFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestGermanStemFilterFactory.java index 5b66c77342c..618ab521f72 100644 --- a/solr/src/test/org/apache/solr/analysis/TestGermanStemFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestGermanStemFilterFactory.java @@ -33,7 +33,7 @@ public class TestGermanStemFilterFactory extends BaseTokenTestCase { */ public void testStemming() throws Exception { Reader reader = new StringReader("Tischen"); - Tokenizer tokenizer = new WhitespaceTokenizer(reader); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); GermanStemFilterFactory factory = new GermanStemFilterFactory(); TokenStream stream = factory.create(tokenizer); assertTokenStreamContents(stream, new String[] { "tisch" }); diff --git a/solr/src/test/org/apache/solr/analysis/TestGreekLowerCaseFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestGreekLowerCaseFilterFactory.java index 9e6fd173d27..f697020e151 100644 --- a/solr/src/test/org/apache/solr/analysis/TestGreekLowerCaseFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestGreekLowerCaseFilterFactory.java @@ -33,7 +33,7 @@ public class TestGreekLowerCaseFilterFactory extends BaseTokenTestCase { */ public void testStemming() throws Exception { Reader reader = new StringReader("Μάϊος ΜΆΪΟΣ"); - Tokenizer tokenizer = new WhitespaceTokenizer(reader); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); GreekLowerCaseFilterFactory factory = new GreekLowerCaseFilterFactory(); TokenStream stream = factory.create(tokenizer); assertTokenStreamContents(stream, new String[] { "μαιοσ", "μαιοσ" }); diff --git a/solr/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java b/solr/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java index 9faf6c23ba8..0652eba21c4 100755 --- a/solr/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java 
+++ b/solr/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java @@ -29,7 +29,7 @@ public class TestHyphenatedWordsFilter extends BaseTokenTestCase { public void testHyphenatedWords() throws Exception { String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal"; // first test - TokenStream ts = new WhitespaceTokenizer(new StringReader(input)); + TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)); HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory(); ts = factory.create(ts); assertTokenStreamContents(ts, @@ -42,7 +42,7 @@ public class TestHyphenatedWordsFilter extends BaseTokenTestCase { public void testHyphenAtEnd() throws Exception { String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecology-"; // first test - TokenStream ts = new WhitespaceTokenizer(new StringReader(input)); + TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)); HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory(); ts = factory.create(ts); assertTokenStreamContents(ts, diff --git a/solr/src/test/org/apache/solr/analysis/TestKeepFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestKeepFilterFactory.java index 83aa79dd3cf..2a9aba5b115 100644 --- a/solr/src/test/org/apache/solr/analysis/TestKeepFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestKeepFilterFactory.java @@ -23,25 +23,22 @@ import java.util.Set; import java.util.Map; import java.util.HashMap; -import junit.framework.TestCase; - - /** * * **/ -public class TestKeepFilterFactory extends TestCase{ +public class TestKeepFilterFactory extends BaseTokenTestCase{ public void testInform() throws Exception { ResourceLoader loader = new SolrResourceLoader(null, null); assertTrue("loader is null and it shouldn't be", loader != null); KeepWordFilterFactory factory = new KeepWordFilterFactory(); - Map args = new HashMap(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); args.put("words", "keep-1.txt"); args.put("ignoreCase", "true"); factory.init(args); factory.inform(loader); - Set words = factory.getWords(); + Set words = factory.getWords(); assertTrue("words is null and it shouldn't be", words != null); assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2); diff --git a/solr/src/test/org/apache/solr/analysis/TestKeepWordFilter.java b/solr/src/test/org/apache/solr/analysis/TestKeepWordFilter.java index 55ac2d72287..7a880de393d 100644 --- a/solr/src/test/org/apache/solr/analysis/TestKeepWordFilter.java +++ b/solr/src/test/org/apache/solr/analysis/TestKeepWordFilter.java @@ -41,7 +41,7 @@ public class TestKeepWordFilter extends BaseTokenTestCase { words.add( "bbb" ); String input = "aaa BBB ccc ddd EEE"; - Map args = new HashMap(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); ResourceLoader loader = new SolrResourceLoader(null, null); // Test Stopwords @@ -51,29 +51,29 @@ public class TestKeepWordFilter extends BaseTokenTestCase { factory.inform( loader ); factory.setWords( words ); assertTrue(factory.isIgnoreCase()); - TokenStream stream = factory.create(new WhitespaceTokenizer(new StringReader(input))); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input))); assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }); // Test Stopwords (ignoreCase via the setter instead) factory = new KeepWordFilterFactory(); - args = new HashMap(); + args = new HashMap(DEFAULT_VERSION_PARAM); 
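The keep-word tests exercise two configuration paths: words loaded from a file through init/inform, and words handed straight to the factory through its setters. Here is a rough standalone sketch of the setter path, not part of the patch; the class name and the three-token input are illustrative, and Version.LUCENE_30 stands in for DEFAULT_VERSION.

import java.io.StringReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;
import org.apache.solr.analysis.KeepWordFilterFactory;
import org.apache.solr.core.SolrResourceLoader;

public class KeepWordSketch {
  public static void main(String[] argv) throws Exception {
    KeepWordFilterFactory factory = new KeepWordFilterFactory();
    Map<String, String> args = new HashMap<String, String>();
    args.put("luceneMatchVersion", "LUCENE_30"); // assumed; the tests pass DEFAULT_VERSION_PARAM
    factory.init(args);
    factory.inform(new SolrResourceLoader(null, null));
    factory.setIgnoreCase(true); // setter path, exactly as the test does
    factory.setWords(new HashSet<String>(Arrays.asList("aaa", "bbb")));
    TokenStream stream = factory.create(new WhitespaceTokenizer(Version.LUCENE_30,
        new StringReader("aaa BBB ccc")));
    // with ignoreCase=true, "aaa" and "BBB" survive and "ccc" is dropped
  }
}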
factory.init( args ); factory.inform( loader ); factory.setIgnoreCase(true); factory.setWords( words ); assertTrue(factory.isIgnoreCase()); - stream = factory.create(new WhitespaceTokenizer(new StringReader(input))); + stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input))); assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }); // Now force case factory = new KeepWordFilterFactory(); - args = new HashMap(); + args = new HashMap(DEFAULT_VERSION_PARAM); args.put( "ignoreCase", "false" ); factory.init( args ); factory.inform( loader ); factory.setWords( words ); assertFalse(factory.isIgnoreCase()); - stream = factory.create(new WhitespaceTokenizer(new StringReader(input))); + stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input))); assertTokenStreamContents(stream, new String[] { "aaa" }); } } diff --git a/solr/src/test/org/apache/solr/analysis/TestKeywordMarkerFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestKeywordMarkerFilterFactory.java new file mode 100644 index 00000000000..cec702cbaba --- /dev/null +++ b/solr/src/test/org/apache/solr/analysis/TestKeywordMarkerFilterFactory.java @@ -0,0 +1,65 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.PorterStemFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.solr.common.ResourceLoader; +import org.apache.solr.core.SolrResourceLoader; + +/** + * Simple tests to ensure the keyword marker filter factory is working. 
+ */ +public class TestKeywordMarkerFilterFactory extends BaseTokenTestCase { + public void testKeywords() throws IOException { + Reader reader = new StringReader("dogs cats"); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); + KeywordMarkerFilterFactory factory = new KeywordMarkerFilterFactory(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); + ResourceLoader loader = new SolrResourceLoader(null, null); + args.put("protected", "protwords.txt"); + factory.init(args); + factory.inform(loader); + + TokenStream ts = new PorterStemFilter(factory.create(tokenizer)); + assertTokenStreamContents(ts, new String[] { "dog", "cats" }); + } + + public void testKeywordsCaseInsensitive() throws IOException { + Reader reader = new StringReader("dogs cats Cats"); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); + KeywordMarkerFilterFactory factory = new KeywordMarkerFilterFactory(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); + ResourceLoader loader = new SolrResourceLoader(null, null); + args.put("protected", "protwords.txt"); + args.put("ignoreCase", "true"); + factory.init(args); + factory.inform(loader); + + TokenStream ts = new PorterStemFilter(factory.create(tokenizer)); + assertTokenStreamContents(ts, new String[] { "dog", "cats", "Cats" }); + } +} diff --git a/solr/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java b/solr/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java index c0a42a65fd8..54032490611 100644 --- a/solr/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java +++ b/solr/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java @@ -20,7 +20,7 @@ public class TestMultiWordSynonyms extends BaseTokenTestCase { SynonymMap synMap = new SynonymMap(true); SynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null); - SynonymFilter ts = new SynonymFilter(new WhitespaceTokenizer(new StringReader("a e")), synMap); + SynonymFilter ts = new SynonymFilter(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("a e")), synMap); // This fails because ["e","e"] is the value of the token stream assertTokenStreamContents(ts, new String[] { "a", "e" }); } diff --git a/solr/src/test/org/apache/solr/analysis/TestNGramFilters.java b/solr/src/test/org/apache/solr/analysis/TestNGramFilters.java index 21027b41c19..0b31ee20371 100644 --- a/solr/src/test/org/apache/solr/analysis/TestNGramFilters.java +++ b/solr/src/test/org/apache/solr/analysis/TestNGramFilters.java @@ -64,7 +64,7 @@ public class TestNGramFilters extends BaseTokenTestCase { Map args = new HashMap(); NGramFilterFactory factory = new NGramFilterFactory(); factory.init(args); - TokenStream stream = factory.create(new WhitespaceTokenizer(reader)); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); assertTokenStreamContents(stream, new String[] { "t", "e", "s", "t", "te", "es", "st" }); } @@ -78,7 +78,7 @@ public class TestNGramFilters extends BaseTokenTestCase { args.put("maxGramSize", "3"); NGramFilterFactory factory = new NGramFilterFactory(); factory.init(args); - TokenStream stream = factory.create(new WhitespaceTokenizer(reader)); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); assertTokenStreamContents(stream, new String[] { "te", "es", "st", "tes", "est" }); } @@ -129,7 +129,7 @@ public class TestNGramFilters extends BaseTokenTestCase { Map args = new HashMap(); EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory(); factory.init(args); - TokenStream stream = 
factory.create(new WhitespaceTokenizer(reader)); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); assertTokenStreamContents(stream, new String[] { "t" }); } @@ -143,7 +143,7 @@ public class TestNGramFilters extends BaseTokenTestCase { args.put("maxGramSize", "2"); EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory(); factory.init(args); - TokenStream stream = factory.create(new WhitespaceTokenizer(reader)); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); assertTokenStreamContents(stream, new String[] { "t", "te" }); } @@ -156,7 +156,7 @@ public class TestNGramFilters extends BaseTokenTestCase { args.put("side", "back"); EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory(); factory.init(args); - TokenStream stream = factory.create(new WhitespaceTokenizer(reader)); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); assertTokenStreamContents(stream, new String[] { "y" }); } diff --git a/solr/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilter.java b/solr/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilter.java index bd36b721ac2..fe35f05bf29 100644 --- a/solr/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilter.java +++ b/solr/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilter.java @@ -47,7 +47,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase { factory.init(args); CharStream cs = factory.create( CharReader.get( new StringReader( BLOCK ) ) ); - TokenStream ts = new WhitespaceTokenizer( cs ); + TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); assertTokenStreamContents(ts, new String[] { "this", "is", "test." }, new int[] { 0, 5, 8 }, @@ -64,7 +64,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase { factory.init(args); CharStream cs = factory.create( CharReader.get( new StringReader( BLOCK ) ) ); - TokenStream ts = new WhitespaceTokenizer( cs ); + TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); assertFalse(ts.incrementToken()); } @@ -80,7 +80,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase { factory.init(args); CharStream cs = factory.create( CharReader.get( new StringReader( BLOCK ) ) ); - TokenStream ts = new WhitespaceTokenizer( cs ); + TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); assertTokenStreamContents(ts, new String[] { "aa#bb#cc" }, new int[] { 0 }, @@ -95,7 +95,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase { final String BLOCK = "aa bb cc dd"; CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1##$2###$3", CharReader.get( new StringReader( BLOCK ) ) ); - TokenStream ts = new WhitespaceTokenizer( cs ); + TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); assertTokenStreamContents(ts, new String[] { "aa##bb###cc", "dd" }, new int[] { 0, 9 }, @@ -109,7 +109,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase { final String BLOCK = " a a"; CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa", CharReader.get( new StringReader( BLOCK ) ) ); - TokenStream ts = new WhitespaceTokenizer( cs ); + TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); assertTokenStreamContents(ts, new String[] { "aa", "aa" }, new int[] { 1, 4 }, @@ -124,7 +124,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase { final String BLOCK = "aa bb cc dd"; CharStream cs = new 
PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2", CharReader.get( new StringReader( BLOCK ) ) ); - TokenStream ts = new WhitespaceTokenizer( cs ); + TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); assertTokenStreamContents(ts, new String[] { "aa#bb", "dd" }, new int[] { 0, 12 }, @@ -139,7 +139,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase { final String BLOCK = " aa bb cc --- aa bb aa bb cc"; CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1 $2 $3", CharReader.get( new StringReader( BLOCK ) ) ); - TokenStream ts = new WhitespaceTokenizer( cs ); + TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); assertTokenStreamContents(ts, new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" }, new int[] { 2, 6, 9, 11, 15, 18, 21, 25, 29 }, @@ -154,7 +154,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase { final String BLOCK = " aa bb cc --- aa bb aa. bb aa bb cc"; CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2", ".", CharReader.get( new StringReader( BLOCK ) ) ); - TokenStream ts = new WhitespaceTokenizer( cs ); + TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); assertTokenStreamContents(ts, new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" }, new int[] { 2, 8, 11, 15, 21, 25, 28, 36 }, @@ -171,7 +171,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase { CharReader.get( new StringReader( BLOCK ) ) ); cs = new PatternReplaceCharFilter( pattern("bb"), "b", ".", cs ); cs = new PatternReplaceCharFilter( pattern("ccc"), "c", ".", cs ); - TokenStream ts = new WhitespaceTokenizer( cs ); + TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); assertTokenStreamContents(ts, new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" }, new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 }, diff --git a/solr/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java b/solr/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java index 0769bfcaa31..6d9b30f0828 100644 --- a/solr/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java +++ b/solr/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java @@ -31,7 +31,7 @@ public class TestPatternReplaceFilter extends BaseTokenTestCase { public void testReplaceAll() throws Exception { String input = "aabfooaabfooabfoob ab caaaaaaaaab"; TokenStream ts = new PatternReplaceFilter - (new WhitespaceTokenizer(new StringReader(input)), + (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), Pattern.compile("a*b"), "-", true); assertTokenStreamContents(ts, @@ -41,7 +41,7 @@ public class TestPatternReplaceFilter extends BaseTokenTestCase { public void testReplaceFirst() throws Exception { String input = "aabfooaabfooabfoob ab caaaaaaaaab"; TokenStream ts = new PatternReplaceFilter - (new WhitespaceTokenizer(new StringReader(input)), + (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), Pattern.compile("a*b"), "-", false); assertTokenStreamContents(ts, @@ -51,7 +51,7 @@ public class TestPatternReplaceFilter extends BaseTokenTestCase { public void testStripFirst() throws Exception { String input = "aabfooaabfooabfoob ab caaaaaaaaab"; TokenStream ts = new PatternReplaceFilter - (new WhitespaceTokenizer(new StringReader(input)), + (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), Pattern.compile("a*b"), null, false); 
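The four testReplace*/testStrip* methods differ only in the last two arguments of the PatternReplaceFilter constructor, whose shape can be read directly off the calls above: a null replacement strips each match, and the final boolean chooses between replacing every match or only the first. A compact sketch follows, not part of the patch; the class name is hypothetical and Version.LUCENE_30 stands in for DEFAULT_VERSION.

import java.io.StringReader;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;
import org.apache.solr.analysis.PatternReplaceFilter;

public class PatternReplaceSketch {
  public static void main(String[] argv) throws Exception {
    TokenStream ts = new PatternReplaceFilter(
        new WhitespaceTokenizer(Version.LUCENE_30,
            new StringReader("aabfooaabfooabfoob")),
        Pattern.compile("a*b"),
        "-",   // replacement; null would strip each match instead
        true); // true = replace every match, false = only the first
    // for this input the token is rewritten to "-foo-foo-foo-"
  }
}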
assertTokenStreamContents(ts, @@ -61,7 +61,7 @@ public class TestPatternReplaceFilter extends BaseTokenTestCase { public void testStripAll() throws Exception { String input = "aabfooaabfooabfoob ab caaaaaaaaab"; TokenStream ts = new PatternReplaceFilter - (new WhitespaceTokenizer(new StringReader(input)), + (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), Pattern.compile("a*b"), null, true); assertTokenStreamContents(ts, @@ -71,7 +71,7 @@ public class TestPatternReplaceFilter extends BaseTokenTestCase { public void testReplaceAllWithBackRef() throws Exception { String input = "aabfooaabfooabfoob ab caaaaaaaaab"; TokenStream ts = new PatternReplaceFilter - (new WhitespaceTokenizer(new StringReader(input)), + (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), Pattern.compile("(a*)b"), "$1\\$", true); assertTokenStreamContents(ts, diff --git a/solr/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java b/solr/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java index 11318eece76..6faf2187d97 100644 --- a/solr/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java @@ -117,7 +117,7 @@ public class TestPatternTokenizerFactory extends BaseTokenTestCase */ private static String tsToString(TokenStream in) throws IOException { StringBuilder out = new StringBuilder(); - TermAttribute termAtt = (TermAttribute) in.addAttribute(TermAttribute.class); + TermAttribute termAtt = in.addAttribute(TermAttribute.class); // extra safety to enforce, that the state is not preserved and also // assign bogus values in.clearAttributes(); diff --git a/solr/src/test/org/apache/solr/analysis/TestPersianNormalizationFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestPersianNormalizationFilterFactory.java index c263477212d..8831f366d3a 100644 --- a/solr/src/test/org/apache/solr/analysis/TestPersianNormalizationFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestPersianNormalizationFilterFactory.java @@ -33,7 +33,7 @@ public class TestPersianNormalizationFilterFactory extends BaseTokenTestCase { */ public void testNormalization() throws Exception { Reader reader = new StringReader("های"); - Tokenizer tokenizer = new WhitespaceTokenizer(reader); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); PersianNormalizationFilterFactory factory = new PersianNormalizationFilterFactory(); TokenStream stream = factory.create(tokenizer); assertTokenStreamContents(stream, new String[] { "هاي" }); diff --git a/solr/src/test/org/apache/solr/analysis/TestPhoneticFilter.java b/solr/src/test/org/apache/solr/analysis/TestPhoneticFilter.java index 62c51ca662e..6346a89f374 100644 --- a/solr/src/test/org/apache/solr/analysis/TestPhoneticFilter.java +++ b/solr/src/test/org/apache/solr/analysis/TestPhoneticFilter.java @@ -83,7 +83,7 @@ public class TestPhoneticFilter extends BaseTokenTestCase { static void assertAlgorithm(String algName, String inject, String input, String[] expected) throws Exception { - Tokenizer tokenizer = new WhitespaceTokenizer( + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)); Map args = new HashMap(); args.put("encoder", algName); diff --git a/solr/src/test/org/apache/solr/analysis/TestPorterStemFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestPorterStemFilterFactory.java index c81458e2b04..acfc2a77a12 100644 --- 
a/solr/src/test/org/apache/solr/analysis/TestPorterStemFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestPorterStemFilterFactory.java @@ -33,7 +33,7 @@ public class TestPorterStemFilterFactory extends BaseTokenTestCase { */ public void testStemming() throws Exception { Reader reader = new StringReader("dogs"); - Tokenizer tokenizer = new WhitespaceTokenizer(reader); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); PorterStemFilterFactory factory = new PorterStemFilterFactory(); TokenStream stream = factory.create(tokenizer); assertTokenStreamContents(stream, new String[] { "dog" }); diff --git a/solr/src/test/org/apache/solr/analysis/TestRemoveDuplicatesTokenFilter.java b/solr/src/test/org/apache/solr/analysis/TestRemoveDuplicatesTokenFilter.java index 11abd65f256..ceeb2d1b604 100644 --- a/solr/src/test/org/apache/solr/analysis/TestRemoveDuplicatesTokenFilter.java +++ b/solr/src/test/org/apache/solr/analysis/TestRemoveDuplicatesTokenFilter.java @@ -17,7 +17,6 @@ package org.apache.solr.analysis; -import junit.framework.TestCase; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; @@ -45,9 +44,9 @@ public class TestRemoveDuplicatesTokenFilter extends BaseTokenTestCase { RemoveDuplicatesTokenFilterFactory factory = new RemoveDuplicatesTokenFilterFactory(); final TokenStream ts = factory.create (new TokenStream() { - TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class); - OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); - PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + TermAttribute termAtt = addAttribute(TermAttribute.class); + OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); public boolean incrementToken() { if (toks.hasNext()) { clearAttributes(); diff --git a/solr/src/test/org/apache/solr/analysis/TestReverseStringFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestReverseStringFilterFactory.java index 2aff8a75277..9672693a868 100644 --- a/solr/src/test/org/apache/solr/analysis/TestReverseStringFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestReverseStringFilterFactory.java @@ -33,7 +33,7 @@ public class TestReverseStringFilterFactory extends BaseTokenTestCase { */ public void testReversing() throws Exception { Reader reader = new StringReader("simple test"); - Tokenizer tokenizer = new WhitespaceTokenizer(reader); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); ReverseStringFilterFactory factory = new ReverseStringFilterFactory(); factory.init(DEFAULT_VERSION_PARAM); TokenStream stream = factory.create(tokenizer); diff --git a/solr/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java index 3e768776392..a457df9a5b8 100644 --- a/solr/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java @@ -58,7 +58,7 @@ public class TestReversedWildcardFilterFactory extends SolrTestCaseJ4 { String text = "simple text"; args.put("withOriginal", "true"); factory.init(args); - TokenStream input = factory.create(new WhitespaceTokenizer(new StringReader(text))); + TokenStream input = 
factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(text))); assertTokenStreamContents(input, new String[] { "\u0001elpmis", "simple", "\u0001txet", "text" }, new int[] { 1, 0, 1, 0 }); @@ -66,7 +66,7 @@ public class TestReversedWildcardFilterFactory extends SolrTestCaseJ4 { // now without original tokens args.put("withOriginal", "false"); factory.init(args); - input = factory.create(new WhitespaceTokenizer(new StringReader(text))); + input = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(text))); assertTokenStreamContents(input, new String[] { "\u0001elpmis", "\u0001txet" }, new int[] { 1, 1 }); diff --git a/solr/src/test/org/apache/solr/analysis/TestRussianFilters.java b/solr/src/test/org/apache/solr/analysis/TestRussianFilters.java index 6ad57acaa76..973900d295d 100644 --- a/solr/src/test/org/apache/solr/analysis/TestRussianFilters.java +++ b/solr/src/test/org/apache/solr/analysis/TestRussianFilters.java @@ -19,8 +19,6 @@ package org.apache.solr.analysis; import java.io.Reader; import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; diff --git a/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java index a0b35c14f44..ede684971ac 100644 --- a/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java @@ -37,7 +37,7 @@ public class TestShingleFilterFactory extends BaseTokenTestCase { Map args = new HashMap(); ShingleFilterFactory factory = new ShingleFilterFactory(); factory.init(args); - TokenStream stream = factory.create(new WhitespaceTokenizer(reader)); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); assertTokenStreamContents(stream, new String[] {"this", "this is", "is", "is a", "a", "a test", "test"}); } @@ -51,7 +51,7 @@ public class TestShingleFilterFactory extends BaseTokenTestCase { args.put("outputUnigrams", "false"); ShingleFilterFactory factory = new ShingleFilterFactory(); factory.init(args); - TokenStream stream = factory.create(new WhitespaceTokenizer(reader)); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); assertTokenStreamContents(stream, new String[] {"this is", "is a", "a test"}); } @@ -65,7 +65,7 @@ public class TestShingleFilterFactory extends BaseTokenTestCase { args.put("maxShingleSize", "3"); ShingleFilterFactory factory = new ShingleFilterFactory(); factory.init(args); - TokenStream stream = factory.create(new WhitespaceTokenizer(reader)); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); assertTokenStreamContents(stream, new String[] {"this", "this is", "this is a", "is", "is a", "is a test", "a", "a test", "test"}); diff --git a/solr/src/test/org/apache/solr/analysis/TestStandardFactories.java b/solr/src/test/org/apache/solr/analysis/TestStandardFactories.java index 229a9ff11e1..62c807f5b7f 100644 --- a/solr/src/test/org/apache/solr/analysis/TestStandardFactories.java +++ b/solr/src/test/org/apache/solr/analysis/TestStandardFactories.java @@ -108,7 +108,7 @@ public class TestStandardFactories extends BaseTokenTestCase { */ public void testASCIIFolding() throws Exception { Reader reader = new StringReader("Česká"); - Tokenizer tokenizer = new WhitespaceTokenizer(reader); + Tokenizer tokenizer = new 
WhitespaceTokenizer(DEFAULT_VERSION, reader); ASCIIFoldingFilterFactory factory = new ASCIIFoldingFilterFactory(); factory.init(DEFAULT_VERSION_PARAM); TokenStream stream = factory.create(tokenizer); @@ -121,7 +121,7 @@ public class TestStandardFactories extends BaseTokenTestCase { */ public void testISOLatin1Folding() throws Exception { Reader reader = new StringReader("Česká"); - Tokenizer tokenizer = new WhitespaceTokenizer(reader); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); ISOLatin1AccentFilterFactory factory = new ISOLatin1AccentFilterFactory(); factory.init(DEFAULT_VERSION_PARAM); TokenStream stream = factory.create(tokenizer); diff --git a/solr/src/test/org/apache/solr/analysis/TestStemmerOverrideFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestStemmerOverrideFilterFactory.java new file mode 100644 index 00000000000..6c173583a5c --- /dev/null +++ b/solr/src/test/org/apache/solr/analysis/TestStemmerOverrideFilterFactory.java @@ -0,0 +1,66 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.PorterStemFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.solr.common.ResourceLoader; +import org.apache.solr.core.SolrResourceLoader; + +/** + * Simple tests to ensure the stemmer override filter factory is working. 
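+ * + * The stemdict.txt dictionary loaded by the factory is assumed to hold one + * tab-separated word/stem pair per line (e.g. "dogs\tcat"), so the override + * (dogs -> cat) wins over whatever a stemmer later in the chain would produce.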
+ */ +public class TestStemmerOverrideFilterFactory extends BaseTokenTestCase { + public void testKeywords() throws IOException { + // our stemdict stems dogs to 'cat' + Reader reader = new StringReader("testing dogs"); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); + StemmerOverrideFilterFactory factory = new StemmerOverrideFilterFactory(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); + ResourceLoader loader = new SolrResourceLoader(null, null); + args.put("dictionary", "stemdict.txt"); + factory.init(args); + factory.inform(loader); + + TokenStream ts = new PorterStemFilter(factory.create(tokenizer)); + assertTokenStreamContents(ts, new String[] { "test", "cat" }); + } + + public void testKeywordsCaseInsensitive() throws IOException { + Reader reader = new StringReader("testing DoGs"); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); + StemmerOverrideFilterFactory factory = new StemmerOverrideFilterFactory(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); + ResourceLoader loader = new SolrResourceLoader(null, null); + args.put("dictionary", "stemdict.txt"); + args.put("ignoreCase", "true"); + factory.init(args); + factory.inform(loader); + + TokenStream ts = new PorterStemFilter(factory.create(tokenizer)); + assertTokenStreamContents(ts, new String[] { "test", "cat" }); + } +} diff --git a/solr/src/test/org/apache/solr/analysis/TestStopFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestStopFilterFactory.java index d024e870cbf..f799c85907a 100644 --- a/solr/src/test/org/apache/solr/analysis/TestStopFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestStopFilterFactory.java @@ -24,25 +24,22 @@ import java.util.Set; import java.util.Map; import java.util.HashMap; -import junit.framework.TestCase; - - /** * * **/ -public class TestStopFilterFactory extends TestCase { +public class TestStopFilterFactory extends BaseTokenTestCase { public void testInform() throws Exception { ResourceLoader loader = new SolrResourceLoader(null, null); assertTrue("loader is null and it shouldn't be", loader != null); StopFilterFactory factory = new StopFilterFactory(); - Map args = new HashMap(); + Map args = new HashMap(DEFAULT_VERSION_PARAM); args.put("words", "stop-1.txt"); args.put("ignoreCase", "true"); factory.init(args); factory.inform(loader); - Set words = factory.getStopWords(); + Set<?> words = factory.getStopWords(); assertTrue("words is null and it shouldn't be", words != null); assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2); assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true); diff --git a/solr/src/test/org/apache/solr/analysis/TestSynonymFilter.java b/solr/src/test/org/apache/solr/analysis/TestSynonymFilter.java index bdf26b70fc6..bf233bde835 100644 --- a/solr/src/test/org/apache/solr/analysis/TestSynonymFilter.java +++ b/solr/src/test/org/apache/solr/analysis/TestSynonymFilter.java @@ -47,14 +47,14 @@ public class TestSynonymFilter extends BaseTokenTestCase { static void assertTokenizesTo(SynonymMap dict, String input, String expected[]) throws IOException { - Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(input)); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)); SynonymFilter stream = new SynonymFilter(tokenizer, dict); assertTokenStreamContents(stream, expected); } static void assertTokenizesTo(SynonymMap dict, String input, String expected[], int posIncs[]) throws IOException { -
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(input)); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)); SynonymFilter stream = new SynonymFilter(tokenizer, dict); assertTokenStreamContents(stream, expected, posIncs); } @@ -381,12 +381,12 @@ public class TestSynonymFilter extends BaseTokenTestCase { private static class IterTokenStream extends TokenStream { final Token tokens[]; int index = 0; - TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class); - OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); - PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); - FlagsAttribute flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class); - TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); - PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); + TermAttribute termAtt = addAttribute(TermAttribute.class); + OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); + FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class); + TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class); public IterTokenStream(Token... tokens) { super(); diff --git a/solr/src/test/org/apache/solr/analysis/TestThaiWordFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestThaiWordFilterFactory.java index 7976123b805..090d1fa678a 100644 --- a/solr/src/test/org/apache/solr/analysis/TestThaiWordFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestThaiWordFilterFactory.java @@ -33,7 +33,7 @@ public class TestThaiWordFilterFactory extends BaseTokenTestCase { */ public void testWordBreak() throws Exception { Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี"); - Tokenizer tokenizer = new WhitespaceTokenizer(reader); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); ThaiWordFilterFactory factory = new ThaiWordFilterFactory(); TokenStream stream = factory.create(tokenizer); assertTokenStreamContents(stream, new String[] {"การ", "ที่", "ได้", diff --git a/solr/src/test/org/apache/solr/analysis/TestTrimFilter.java b/solr/src/test/org/apache/solr/analysis/TestTrimFilter.java index 4d0f446c4ee..32fba1c30f5 100644 --- a/solr/src/test/org/apache/solr/analysis/TestTrimFilter.java +++ b/solr/src/test/org/apache/solr/analysis/TestTrimFilter.java @@ -81,12 +81,12 @@ public class TestTrimFilter extends BaseTokenTestCase { private static class IterTokenStream extends TokenStream { final Token tokens[]; int index = 0; - TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class); - OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); - PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); - FlagsAttribute flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class); - TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); - PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); + TermAttribute termAtt = addAttribute(TermAttribute.class); + OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); + FlagsAttribute flagsAtt = 
addAttribute(FlagsAttribute.class); + TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class); public IterTokenStream(Token... tokens) { super(); diff --git a/solr/src/test/org/apache/solr/analysis/TestTurkishLowerCaseFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestTurkishLowerCaseFilterFactory.java index 7f24a01fe6d..c533a53ad85 100644 --- a/solr/src/test/org/apache/solr/analysis/TestTurkishLowerCaseFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestTurkishLowerCaseFilterFactory.java @@ -33,7 +33,7 @@ public class TestTurkishLowerCaseFilterFactory extends BaseTokenTestCase { */ public void testCasing() throws Exception { Reader reader = new StringReader("AĞACI"); - Tokenizer tokenizer = new WhitespaceTokenizer(reader); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); TurkishLowerCaseFilterFactory factory = new TurkishLowerCaseFilterFactory(); TokenStream stream = factory.create(tokenizer); assertTokenStreamContents(stream, new String[] { "ağacı" }); diff --git a/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java b/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java index 2ecac44e80e..c2135878b04 100644 --- a/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java +++ b/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java @@ -350,8 +350,8 @@ public class TestWordDelimiterFilter extends SolrTestCaseJ4 { protected LargePosIncTokenFilter(TokenStream input) { super(input); - termAtt = (TermAttribute) addAttribute(TermAttribute.class); - posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + termAtt = addAttribute(TermAttribute.class); + posIncAtt = addAttribute(PositionIncrementAttribute.class); } @Override @@ -368,13 +368,13 @@ public class TestWordDelimiterFilter extends SolrTestCaseJ4 { @Test public void testPositionIncrements() throws Exception { - final CharArraySet protWords = new CharArraySet(new HashSet(Arrays.asList("NUTCH")), false); + final CharArraySet protWords = new CharArraySet(DEFAULT_VERSION, new HashSet(Arrays.asList("NUTCH")), false); /* analyzer that uses whitespace + wdf */ Analyzer a = new Analyzer() { public TokenStream tokenStream(String field, Reader reader) { return new WordDelimiterFilter( - new WhitespaceTokenizer(reader), + new WhitespaceTokenizer(DEFAULT_VERSION, reader), 1, 1, 0, 0, 1, 1, 0, 1, 1, protWords); } }; @@ -401,7 +401,7 @@ public class TestWordDelimiterFilter extends SolrTestCaseJ4 { public TokenStream tokenStream(String field, Reader reader) { return new WordDelimiterFilter( new LargePosIncTokenFilter( - new WhitespaceTokenizer(reader)), + new WhitespaceTokenizer(DEFAULT_VERSION, reader)), 1, 1, 0, 0, 1, 1, 0, 1, 1, protWords); } }; diff --git a/solr/src/test/org/apache/solr/handler/DocumentAnalysisRequestHandlerTest.java b/solr/src/test/org/apache/solr/handler/DocumentAnalysisRequestHandlerTest.java index 782d5eeb2d2..efd9fdb9461 100644 --- a/solr/src/test/org/apache/solr/handler/DocumentAnalysisRequestHandlerTest.java +++ b/solr/src/test/org/apache/solr/handler/DocumentAnalysisRequestHandlerTest.java @@ -190,8 +190,8 @@ public class DocumentAnalysisRequestHandlerTest extends AnalysisRequestHandlerTe assertNotNull("Expecting the 'StopFilter' to be applied on the query for the 'text' field", tokenList); assertEquals("Query has only one token", 1, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("jumping", null, "<ALPHANUM>",
0, 7, 1, null, false)); - tokenList = (List) queryResult.get("org.apache.solr.analysis.EnglishPorterFilter"); - assertNotNull("Expecting the 'EnglishPorterFilter' to be applied on the query for the 'text' field", tokenList); + tokenList = (List) queryResult.get("org.apache.lucene.analysis.snowball.SnowballFilter"); + assertNotNull("Expecting the 'SnowballFilter' to be applied on the query for the 'text' field", tokenList); assertEquals("Query has only one token", 1, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("jump", null, "<ALPHANUM>", 0, 7, 1, null, false)); indexResult = textResult.get("index"); @@ -231,8 +231,8 @@ public class DocumentAnalysisRequestHandlerTe assertToken(tokenList.get(1), new TokenInfo("jumped", null, "<ALPHANUM>", 8, 14, 2, null, false)); assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 3, null, false)); assertToken(tokenList.get(3), new TokenInfo("dogs", null, "<ALPHANUM>", 24, 28, 4, null, false)); - tokenList = valueResult.get("org.apache.solr.analysis.EnglishPorterFilter"); - assertNotNull("Expecting the 'EnglishPorterFilter' to be applied on the index for the 'text' field", tokenList); + tokenList = valueResult.get("org.apache.lucene.analysis.snowball.SnowballFilter"); + assertNotNull("Expecting the 'SnowballFilter' to be applied on the index for the 'text' field", tokenList); assertEquals("Expecting 4 tokens", 4, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 1, null, false)); assertToken(tokenList.get(1), new TokenInfo("jump", null, "<ALPHANUM>", 8, 14, 2, null, true)); diff --git a/solr/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java b/solr/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java index 7e45ee0d2a1..95a8b116a7f 100644 --- a/solr/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java +++ b/solr/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java @@ -173,8 +173,8 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB assertToken(tokenList.get(5), new TokenInfo("lazy", null, "<ALPHANUM>", 34, 38, 6, null, false)); assertToken(tokenList.get(6), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 7, null, true)); assertToken(tokenList.get(7), new TokenInfo("dogs", null, "<ALPHANUM>", 45, 49, 8, null, false)); - tokenList = indexPart.get("org.apache.solr.analysis.EnglishPorterFilter"); - assertNotNull("Expcting EnglishPorterFilter analysis breakdown", tokenList); + tokenList = indexPart.get("org.apache.lucene.analysis.snowball.SnowballFilter"); + assertNotNull("Expecting SnowballFilter analysis breakdown", tokenList); assertEquals(tokenList.size(), 8); assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 1, null, false)); assertToken(tokenList.get(1), new TokenInfo("red", null, "<ALPHANUM>", 10, 13, 2, null, false)); @@ -208,8 +208,8 @@ public class FieldAnalysisRequestHandlerTestB assertEquals(2, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 0, 3, 1, null, false)); assertToken(tokenList.get(1), new TokenInfo("brown", null, "<ALPHANUM>", 4, 9, 2, null, false)); - tokenList = queryPart.get("org.apache.solr.analysis.EnglishPorterFilter"); - assertNotNull("Expcting EnglishPorterFilter analysis breakdown", tokenList); + tokenList = queryPart.get("org.apache.lucene.analysis.snowball.SnowballFilter"); + assertNotNull("Expecting SnowballFilter analysis breakdown", tokenList); assertEquals(2, tokenList.size()); assertToken(tokenList.get(0), new
TokenInfo("fox", null, "", 0, 3, 1, null, false)); assertToken(tokenList.get(1), new TokenInfo("brown", null, "", 4, 9, 2, null, false)); diff --git a/solr/src/test/org/apache/solr/util/TestCharArrayMap.java b/solr/src/test/org/apache/solr/util/TestCharArrayMap.java deleted file mode 100755 index f0d9466b941..00000000000 --- a/solr/src/test/org/apache/solr/util/TestCharArrayMap.java +++ /dev/null @@ -1,209 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.util; - -import junit.framework.TestCase; - -import java.util.*; - -import org.apache.lucene.analysis.StopAnalyzer; - -public class TestCharArrayMap extends TestCase { - Random r = new Random(0); - - public void doRandom(int iter, boolean ignoreCase) { - CharArrayMap map = new CharArrayMap(1,ignoreCase); - HashMap hmap = new HashMap(); - - char[] key; - for (int i=0; i cm = new CharArrayMap(2,false); - HashMap hm = new HashMap(); - hm.put("foo",1); - hm.put("bar",2); - cm.putAll(hm); - assertEquals(hm, cm); - assertEquals(cm, hm); - hm.put("baz", 3); - assertFalse(hm.equals(cm)); - assertFalse(cm.equals(hm)); - assertTrue(cm.equals(cm)); - cm.putAll(hm); - assertEquals(hm, cm); - - Iterator> iter1 = cm.entrySet().iterator(); - int n=0; - while (iter1.hasNext()) { - Map.Entry entry = iter1.next(); - String key = entry.getKey(); - Integer val = entry.getValue(); - assertEquals(hm.get(key), val); - entry.setValue(val*100); - assertEquals(val*100, (int)cm.get(key)); - n++; - } - assertEquals(hm.size(), n); - cm.clear(); - cm.putAll(hm); - - CharArrayMap.EntryIterator iter2 = cm.iterator(); - n=0; - while (iter2.hasNext()) { - char[] keyc = iter2.nextKey(); - Integer val = iter2.currentValue(); - assertEquals(hm.get(new String(keyc)), val); - iter2.setValue(val*100); - assertEquals(val*100, (int)cm.get(keyc)); - n++; - } - assertEquals(hm.size(), n); - - cm.clear(); - assertEquals(0, cm.size()); - assertTrue(cm.isEmpty()); - } - - - // performance test vs HashMap - // HashMap will have an edge because we are testing with - // non-dynamically created keys and String caches hashCode - public static void main(String[] args) { - int a=0; - String impl = args[a++].intern(); // hash OR chars OR char - int iter1 = Integer.parseInt(args[a++]); // iterations of put() - int iter2 = Integer.parseInt(args[a++]); // iterations of get() - - int ret=0; - long start = System.currentTimeMillis(); - Set stopwords = (Set) StopAnalyzer.ENGLISH_STOP_WORDS_SET; - // words = "this is a different test to see what is really going on here... 
I hope it works well but I'm not sure it will".split(" "); - char[][] stopwordschars = new char[stopwords.size()][]; - Iterator<String> it = stopwords.iterator(); - for (int i=0; i<stopwordschars.length; i++) - stopwordschars[i] = it.next().toCharArray(); - - HashMap<String,Integer> hm=null; - CharArrayMap<Integer> cm=null; - - if (impl=="hash") { - for (int i=0; i<iter1; i++) { - hm = new HashMap<String,Integer>(); - int v=0; - for (String word : stopwords) { - hm.put(word, ++v); - } - ret += hm.size(); - } - } else if (impl=="chars") { - for (int i=0; i<iter1; i++) { - cm = new CharArrayMap<Integer>(2,false); - int v=0; - for (String s : stopwords) { - cm.put(s,++v); - } - ret += cm.size(); - } - } else if (impl=="char") { - for (int i=0; i<iter1; i++) { - cm = new CharArrayMap<Integer>(2,false); - int v=0; - for (char[] s : stopwordschars) { - cm.put(s,++v); - } - ret += cm.size(); - } - } - - - if (impl=="hash") { - for (int i=0; i