mirror of https://github.com/apache/lucene.git

LUCENE-2055: better snowball integration, deprecate buggy handcoded snowball impls, restructure lang support

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@907125 13f79535-47bb-0310-9956-ffa450edef68

parent 57d1387492
commit a6b7c5552b
@@ -23,6 +23,11 @@ stopword list that is BSD-licensed created by Jacques Savoy. The file resides i
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt.
See http://members.unine.ch/jacques.savoy/clef/index.html.

The Romanian analyzer (contrib/analyzers) comes with a default
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt.
See http://members.unine.ch/jacques.savoy/clef/index.html.

The Bulgarian analyzer (contrib/analyzers) comes with a default
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt.
@@ -27,6 +27,10 @@ Changes in runtime behavior
  used with Version > 3.0 and the TurkishStemmer.
  (Robert Muir via Simon Willnauer)

* LUCENE-2055: GermanAnalyzer now uses the Snowball German2 algorithm and
  stopwords list by default for Version > 3.0.
  (Robert Muir, Uwe Schindler, Simon Willnauer)

Bug fixes

* LUCENE-2199: ShingleFilter skipped over tri-gram shingles if outputUnigram

@@ -53,6 +57,13 @@ Bug fixes
* LUCENE-2207, LUCENE-2219: Fix incorrect offset calculations in end() for
  CJKTokenizer, ChineseTokenizer, SmartChinese SentenceTokenizer,
  and WikipediaTokenizer. (Koji Sekiguchi, Robert Muir)

* LUCENE-2055: Deprecated RussianTokenizer, RussianStemmer, RussianStemFilter,
  FrenchStemmer, FrenchStemFilter, DutchStemmer, and DutchStemFilter. For
  these Analyzers, SnowballFilter is used instead (for Version > 3.0), as
  the previous code did not always implement the Snowball algorithm correctly.
  Additionally, for Version > 3.0, the Snowball stopword lists are used by
  default. (Robert Muir, Uwe Schindler, Simon Willnauer)

API Changes
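For code that built a chain around one of the now-deprecated filters, the intended replacement is the generated Snowball stemmer; a minimal migration sketch, assuming only the SnowballFilter(TokenStream, stemmer) constructor used throughout this patch:

```java
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.tartarus.snowball.ext.FrenchStemmer;

class FrenchStemMigration {
  TokenStream stem(TokenStream input) {
    // before (deprecated, did not always match the Snowball algorithm):
    //   new FrenchStemFilter(input)
    // after: the generated Snowball stemmer
    return new SnowballFilter(input, new FrenchStemmer());
  }
}
```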
@@ -68,6 +79,12 @@ API Changes

* LUCENE-2204: Change some package private classes/members to publicly accessible to implement
  custom FragmentsBuilders. (Koji Sekiguchi)

* LUCENE-2055: Integrate snowball into contrib/analyzers. SnowballAnalyzer is
  now deprecated in favor of language-specific analyzers which contain things
  such as stopword lists and any language-specific processing in addition to
  stemming. Add Turkish and Romanian stopwords lists to support this.
  (Robert Muir, Uwe Schindler, Simon Willnauer)

New features
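A minimal sketch of that replacement, using the EnglishAnalyzer added in this commit:

```java
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.util.Version;

class SnowballMigration {
  Analyzer analyzer() {
    // before (deprecated): new SnowballAnalyzer(Version.LUCENE_31, "English")
    // after: a language-specific analyzer that also carries stopwords and
    // any language-specific processing in addition to stemming
    return new EnglishAnalyzer(Version.LUCENE_31);
  }
}
```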
@@ -105,6 +122,10 @@ New features

* LUCENE-2234: Add a Hindi analyzer. (Robert Muir)

* LUCENE-2055: Add analyzers/misc/StemmerOverrideFilter. This filter provides
  the ability to override any stemmer with a custom dictionary map.
  (Robert Muir, Uwe Schindler, Simon Willnauer)

Build

* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
@@ -0,0 +1,129 @@
package org.apache.lucene.analysis.da;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.DanishStemmer;

/**
 * {@link Analyzer} for Danish.
 */
public final class DanishAnalyzer extends StopwordAnalyzerBase {
  private final Set<?> stemExclusionSet;

  /** File containing default Danish stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "danish_stop.txt";

  /**
   * Returns an unmodifiable instance of the default stop words set.
   * @return default stop words set.
   */
  public static Set<?> getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
   * accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final Set<?> DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
            DEFAULT_STOPWORD_FILE);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set");
      }
    }
  }

  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   */
  public DanishAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   */
  public DanishAnalyzer(Version matchVersion, Set<?> stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided, this analyzer will add a {@link KeywordMarkerTokenFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   * @param stemExclusionSet a set of terms not to be stemmed
   */
  public DanishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  }

  /**
   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
   * {@link Reader}.
   *
   * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
   *         exclusion set is provided, and {@link SnowballFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty())
      result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new DanishStemmer());
    return new TokenStreamComponents(source, result);
  }
}
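The new language analyzers in this commit all share this shape (tokenize, standard-filter, lowercase, stop, optionally protect exclusions, stem). A hedged usage sketch for the Danish one, using the 3.x TermAttribute API; the sample text is illustrative:

```java
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.da.DanishAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class DanishAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    DanishAnalyzer analyzer = new DanishAnalyzer(Version.LUCENE_31);
    TokenStream ts = analyzer.tokenStream("body",
        new StringReader("Dette er en prøve"));
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      // stopped, lowercased, Danish-stemmed terms
      System.out.println(term.term());
    }
    ts.end();
    ts.close();
  }
}
```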
@@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Danish.
</body>
</html>
@@ -36,10 +36,12 @@ import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.German2Stemmer;

/**
 * {@link Analyzer} for German language.
@@ -51,6 +53,16 @@ import org.apache.lucene.util.Version;
 * exclusion list is empty by default.
 * </p>
 *
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating GermanAnalyzer:
 * <ul>
 *   <li> As of 3.1, Snowball stemming is done with SnowballFilter, and
 *        Snowball stopwords are used by default.
 *   <li> As of 2.9, StopFilter preserves position
 *        increments
 * </ul>
 *
 * <p><b>NOTE</b>: This class uses the same {@link Version}
 * dependent settings as {@link StandardAnalyzer}.</p>
 */
@@ -60,7 +72,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
   * List of typical german stopwords.
   * @deprecated use {@link #getDefaultStopSet()} instead
   */
-  //TODO make this private in 3.1
+  //TODO make this private in 3.1, remove in 4.0
  @Deprecated
  public final static String[] GERMAN_STOP_WORDS = {
    "einer", "eine", "eines", "einem", "einen",
@@ -77,6 +89,9 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
    "durch", "wegen", "wird"
  };

  /** File containing default German stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "german_stop.txt";

  /**
   * Returns a set of default German-stopwords
   * @return a set of default German-stopwords
@@ -86,8 +101,21 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
  }

  private static class DefaultSetHolder {
-    private static final Set<?> DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
+    /** @deprecated remove in Lucene 4.0 */
+    @Deprecated
+    private static final Set<?> DEFAULT_SET_30 = CharArraySet.unmodifiableSet(new CharArraySet(
        Version.LUCENE_CURRENT, Arrays.asList(GERMAN_STOP_WORDS), false));
+    private static final Set<?> DEFAULT_SET;
+    static {
+      try {
+        DEFAULT_SET =
+          WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR)
+        throw new RuntimeException("Unable to load default stopword set");
+      }
+    }
  }

  /**
@@ -105,7 +133,9 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
   * {@link #getDefaultStopSet()}.
   */
  public GermanAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_SET);
+    this(matchVersion,
+        matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_SET
+            : DefaultSetHolder.DEFAULT_SET_30);
  }

  /**
@@ -199,8 +229,9 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
   *
   * @return {@link TokenStreamComponents} built from a
   *         {@link StandardTokenizer} filtered with {@link StandardFilter},
-   *         {@link LowerCaseFilter}, {@link StopFilter}, and
-   *         {@link GermanStemFilter}
+   *         {@link LowerCaseFilter}, {@link StopFilter},
+   *         {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided, and
+   *         {@link SnowballFilter}
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
@@ -210,6 +241,10 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter( matchVersion, result, stopwords);
    result = new KeywordMarkerTokenFilter(result, exclusionSet);
-    return new TokenStreamComponents(source, new GermanStemFilter(result));
+    if (matchVersion.onOrAfter(Version.LUCENE_31))
+      result = new SnowballFilter(result, new German2Stemmer());
+    else
+      result = new GermanStemFilter(result);
+    return new TokenStreamComponents(source, result);
  }
}
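A short sketch of the Version-gated behavior introduced above; both constructors are taken from this diff:

```java
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.util.Version;

class GermanAnalyzerVersions {
  // LUCENE_31 and later: Snowball German2 stemmer and Snowball stopwords
  GermanAnalyzer current = new GermanAnalyzer(Version.LUCENE_31);
  // LUCENE_30 and earlier: the old GermanStemFilter and GERMAN_STOP_WORDS,
  // preserving compatibility with already-built indexes
  GermanAnalyzer legacy = new GermanAnalyzer(Version.LUCENE_30);
}
```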
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.util.Version;
@@ -41,6 +42,15 @@ import java.util.Set;
 * A default set of stopwords is used unless an alternative list is specified.
 * </p>
 *
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating GreekAnalyzer:
 * <ul>
 *   <li> As of 3.1, StandardFilter is used by default.
 *   <li> As of 2.9, StopFilter preserves position
 *        increments
 * </ul>
 *
 * <p><b>NOTE</b>: This class uses the same {@link Version}
 * dependent settings as {@link StandardAnalyzer}.</p>
 */
@@ -117,13 +127,15 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase
   *
   * @return {@link TokenStreamComponents} built from a
   *         {@link StandardTokenizer} filtered with
-   *         {@link GreekLowerCaseFilter} and {@link StopFilter}
+   *         {@link GreekLowerCaseFilter}, {@link StandardFilter} and {@link StopFilter}
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
-    final TokenStream result = new GreekLowerCaseFilter(source);
+    TokenStream result = new GreekLowerCaseFilter(source);
+    if (matchVersion.onOrAfter(Version.LUCENE_31))
+      result = new StandardFilter(result);
    return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
  }
}
@@ -0,0 +1,113 @@
package org.apache.lucene.analysis.en;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

/**
 * {@link Analyzer} for English.
 */
public final class EnglishAnalyzer extends StopwordAnalyzerBase {
  private final Set<?> stemExclusionSet;

  /**
   * Returns an unmodifiable instance of the default stop words set.
   * @return default stop words set.
   */
  public static Set<?> getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
   * accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final Set<?> DEFAULT_STOP_SET = StandardAnalyzer.STOP_WORDS_SET;
  }

  /**
   * Builds an analyzer with the default stop words: {@link #getDefaultStopSet}.
   */
  public EnglishAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   */
  public EnglishAnalyzer(Version matchVersion, Set<?> stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided, this analyzer will add a {@link KeywordMarkerTokenFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   * @param stemExclusionSet a set of terms not to be stemmed
   */
  public EnglishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  }

  /**
   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
   * {@link Reader}.
   *
   * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
   *         exclusion set is provided, and {@link PorterStemFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty())
      result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
    result = new PorterStemFilter(result);
    return new TokenStreamComponents(source, result);
  }
}
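The three-argument constructor is what makes the stem exclusion hook useful; a sketch, assuming the usual Porter behavior of stemming "flies" to "fli":

```java
import java.util.Collections;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.util.Version;

class StemExclusionDemo {
  // Terms in the exclusion set are marked via KeywordMarkerTokenFilter, so
  // the PorterStemFilter leaves them intact while everything else is stemmed.
  EnglishAnalyzer analyzer = new EnglishAnalyzer(Version.LUCENE_31,
      EnglishAnalyzer.getDefaultStopSet(),
      Collections.singleton("flies"));
}
```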
@@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html><head></head>
<body>
Analyzer for English.
</body>
</html>
@@ -0,0 +1,129 @@
package org.apache.lucene.analysis.es;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.SpanishStemmer;

/**
 * {@link Analyzer} for Spanish.
 */
public final class SpanishAnalyzer extends StopwordAnalyzerBase {
  private final Set<?> stemExclusionSet;

  /** File containing default Spanish stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "spanish_stop.txt";

  /**
   * Returns an unmodifiable instance of the default stop words set.
   * @return default stop words set.
   */
  public static Set<?> getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
   * accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final Set<?> DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
            DEFAULT_STOPWORD_FILE);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set");
      }
    }
  }

  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   */
  public SpanishAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   */
  public SpanishAnalyzer(Version matchVersion, Set<?> stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided, this analyzer will add a {@link KeywordMarkerTokenFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   * @param stemExclusionSet a set of terms not to be stemmed
   */
  public SpanishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  }

  /**
   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
   * {@link Reader}.
   *
   * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
   *         exclusion set is provided, and {@link SnowballFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty())
      result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new SpanishStemmer());
    return new TokenStreamComponents(source, result);
  }
}
@@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Spanish.
</body>
</html>
@@ -0,0 +1,129 @@
package org.apache.lucene.analysis.fi;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.FinnishStemmer;

/**
 * {@link Analyzer} for Finnish.
 */
public final class FinnishAnalyzer extends StopwordAnalyzerBase {
  private final Set<?> stemExclusionSet;

  /** File containing default Finnish stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "finnish_stop.txt";

  /**
   * Returns an unmodifiable instance of the default stop words set.
   * @return default stop words set.
   */
  public static Set<?> getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
   * accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final Set<?> DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
            DEFAULT_STOPWORD_FILE);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set");
      }
    }
  }

  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   */
  public FinnishAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   */
  public FinnishAnalyzer(Version matchVersion, Set<?> stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided, this analyzer will add a {@link KeywordMarkerTokenFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   * @param stemExclusionSet a set of terms not to be stemmed
   */
  public FinnishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  }

  /**
   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
   * {@link Reader}.
   *
   * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
   *         exclusion set is provided, and {@link SnowballFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty())
      result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new FinnishStemmer());
    return new TokenStreamComponents(source, result);
  }
}
@@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Finnish.
</body>
</html>
@@ -68,7 +68,7 @@ public final class ElisionFilter extends TokenFilter {
  /**
   * Constructs an elision filter with standard stop words
   */
-  protected ElisionFilter(Version matchVersion, TokenStream input) {
+  public ElisionFilter(Version matchVersion, TokenStream input) {
    this(matchVersion, input, DEFAULT_ARTICLES);
  }

@@ -77,7 +77,7 @@ public final class ElisionFilter extends TokenFilter {
   * @deprecated use {@link #ElisionFilter(Version, TokenStream)} instead
   */
  @Deprecated
-  protected ElisionFilter(TokenStream input) {
+  public ElisionFilter(TokenStream input) {
    this(Version.LUCENE_30, input);
  }
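With the constructors now public, ElisionFilter can be dropped into custom chains directly; a minimal sketch:

```java
import java.io.Reader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.fr.ElisionFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

class ElisionDemo {
  TokenStream build(Reader reader) {
    TokenStream ts = new StandardTokenizer(Version.LUCENE_31, reader);
    // strips contracted articles such as l' and d' before further filtering
    return new ElisionFilter(Version.LUCENE_31, ts);
  }
}
```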
@@ -27,6 +27,7 @@ import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
@@ -55,6 +56,9 @@ import java.util.Set;
 * <p>You must specify the required {@link Version}
 * compatibility when creating FrenchAnalyzer:
 * <ul>
 *   <li> As of 3.1, Snowball stemming is done with SnowballFilter,
 *        LowerCaseFilter is used prior to StopFilter, and ElisionFilter and
 *        Snowball stopwords are used by default.
 *   <li> As of 2.9, StopFilter preserves position
 *        increments
 * </ul>
@@ -68,7 +72,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
   * Extended list of typical French stopwords.
   * @deprecated use {@link #getDefaultStopSet()} instead
   */
-  // TODO make this private in 3.1
+  // TODO make this private in 3.1, remove in 4.0
  @Deprecated
  public final static String[] FRENCH_STOP_WORDS = {
    "a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi",
@@ -95,6 +99,9 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
    "été", "être", "ô"
  };

  /** File containing default French stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "french_stop.txt";

  /**
   * Contains words that should be indexed but not stemmed.
   */
@@ -110,16 +117,31 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
  }

  private static class DefaultSetHolder {
-    static final Set<?> DEFAULT_STOP_SET = CharArraySet
+    /** @deprecated remove this in Lucene 4.0 */
+    @Deprecated
+    static final Set<?> DEFAULT_STOP_SET_30 = CharArraySet
        .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(FRENCH_STOP_WORDS),
            false));
+    static final Set<?> DEFAULT_STOP_SET;
+    static {
+      try {
+        DEFAULT_STOP_SET =
+          WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR)
+        throw new RuntimeException("Unable to load default stopword set");
+      }
+    }
  }

  /**
-   * Builds an analyzer with the default stop words ({@link #FRENCH_STOP_WORDS}).
+   * Builds an analyzer with the default stop words ({@link #getDefaultStopSet}).
   */
  public FrenchAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+    this(matchVersion,
+        matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_STOP_SET
+            : DefaultSetHolder.DEFAULT_STOP_SET_30);
  }

  /**
@@ -207,20 +229,34 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
   * {@link Reader}.
   *
   * @return {@link TokenStreamComponents} built from a {@link StandardTokenizer}
-   *         filtered with {@link StandardFilter}, {@link StopFilter},
-   *         {@link FrenchStemFilter} and {@link LowerCaseFilter}
+   *         filtered with {@link StandardFilter}, {@link ElisionFilter},
+   *         {@link LowerCaseFilter}, {@link StopFilter},
+   *         {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided,
+   *         and {@link SnowballFilter}
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
-    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
-    TokenStream result = new StandardFilter(source);
-    result = new StopFilter(matchVersion, result, stopwords);
-    if (!excltable.isEmpty())
-      result = new KeywordMarkerTokenFilter(result, excltable);
-    result = new FrenchStemFilter(result);
-    // Convert to lowercase after stemming!
-    return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
+    if (matchVersion.onOrAfter(Version.LUCENE_31)) {
+      final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+      TokenStream result = new StandardFilter(source);
+      result = new ElisionFilter(matchVersion, result);
+      result = new LowerCaseFilter(matchVersion, result);
+      result = new StopFilter(matchVersion, result, stopwords);
+      if (!excltable.isEmpty())
+        result = new KeywordMarkerTokenFilter(result, excltable);
+      result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer());
+      return new TokenStreamComponents(source, result);
+    } else {
+      final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+      TokenStream result = new StandardFilter(source);
+      result = new StopFilter(matchVersion, result, stopwords);
+      if (!excltable.isEmpty())
+        result = new KeywordMarkerTokenFilter(result, excltable);
+      result = new FrenchStemFilter(result);
+      // Convert to lowercase after stemming!
+      return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
    }
  }
}
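A hedged sketch of the observable effect of the new 3.1 chain (elision runs before lowercasing, stopping, and stemming); the expected output is an assumption, not taken from this commit:

```java
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class FrenchChainDemo {
  public static void main(String[] args) throws IOException {
    FrenchAnalyzer analyzer = new FrenchAnalyzer(Version.LUCENE_31);
    // "l'avion" should have its article elided before the Snowball stemmer
    // runs; with an earlier Version the old FrenchStemFilter chain is used.
    TokenStream ts = analyzer.tokenStream("f", new StringReader("l'avion"));
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term());
    }
    ts.close();
  }
}
```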
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.fr;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter; // for javadoc
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

@@ -40,7 +41,11 @@ import java.util.Set;
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 * @see KeywordMarkerTokenFilter
 * @deprecated Use {@link SnowballFilter} with
 * {@link org.tartarus.snowball.ext.FrenchStemmer} instead, which has the
 * same functionality. This filter will be removed in Lucene 4.0
 */
@Deprecated
public final class FrenchStemFilter extends TokenFilter {

  /**
@@ -25,8 +25,10 @@ package org.apache.lucene.analysis.fr;
 * refer to http://snowball.sourceforge.net/french/stemmer.html<br>
 * (French stemming algorithm) for details
 * </p>
 * @deprecated Use {@link org.tartarus.snowball.ext.FrenchStemmer} instead,
 * which has the same functionality. This filter will be removed in Lucene 4.0
 */

@Deprecated
public class FrenchStemmer {

  /**
@@ -0,0 +1,129 @@
package org.apache.lucene.analysis.hu;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.HungarianStemmer;

/**
 * {@link Analyzer} for Hungarian.
 */
public final class HungarianAnalyzer extends StopwordAnalyzerBase {
  private final Set<?> stemExclusionSet;

  /** File containing default Hungarian stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "hungarian_stop.txt";

  /**
   * Returns an unmodifiable instance of the default stop words set.
   * @return default stop words set.
   */
  public static Set<?> getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
   * accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final Set<?> DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
            DEFAULT_STOPWORD_FILE);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set");
      }
    }
  }

  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   */
  public HungarianAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   */
  public HungarianAnalyzer(Version matchVersion, Set<?> stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided, this analyzer will add a {@link KeywordMarkerTokenFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   * @param stemExclusionSet a set of terms not to be stemmed
   */
  public HungarianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  }

  /**
   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
   * {@link Reader}.
   *
   * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
   *         exclusion set is provided, and {@link SnowballFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty())
      result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new HungarianStemmer());
    return new TokenStreamComponents(source, result);
  }
}
@@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Hungarian.
</body>
</html>
@@ -0,0 +1,129 @@
package org.apache.lucene.analysis.it;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.ItalianStemmer;

/**
 * {@link Analyzer} for Italian.
 */
public final class ItalianAnalyzer extends StopwordAnalyzerBase {
  private final Set<?> stemExclusionSet;

  /** File containing default Italian stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "italian_stop.txt";

  /**
   * Returns an unmodifiable instance of the default stop words set.
   * @return default stop words set.
   */
  public static Set<?> getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
   * accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final Set<?> DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
            DEFAULT_STOPWORD_FILE);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set");
      }
    }
  }

  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   */
  public ItalianAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   */
  public ItalianAnalyzer(Version matchVersion, Set<?> stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided, this analyzer will add a {@link KeywordMarkerTokenFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   * @param stemExclusionSet a set of terms not to be stemmed
   */
  public ItalianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  }

  /**
   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
   * {@link Reader}.
   *
   * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
   *         exclusion set is provided, and {@link SnowballFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty())
      result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new ItalianStemmer());
    return new TokenStreamComponents(source, result);
  }
}
@@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Italian.
</body>
</html>
@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.miscellaneous;
 */

import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;

import java.io.IOException;
@@ -0,0 +1,70 @@
package org.apache.lucene.analysis.miscellaneous;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Map;

import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

/**
 * Provides the ability to override any {@link KeywordAttribute} aware stemmer
 * with custom dictionary-based stemming.
 */
public final class StemmerOverrideFilter extends TokenFilter {
  private final CharArrayMap<String> dictionary;

  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);

  /**
   * Create a new StemmerOverrideFilter, performing dictionary-based stemming
   * with the provided <code>dictionary</code>.
   * <p>
   * Any dictionary-stemmed terms will be marked with {@link KeywordAttribute}
   * so that they will not be stemmed with stemmers down the chain.
   * </p>
   */
  public StemmerOverrideFilter(Version matchVersion, TokenStream input,
      Map<?,String> dictionary) {
    super(input);
    this.dictionary = dictionary instanceof CharArrayMap ?
      (CharArrayMap<String>) dictionary : CharArrayMap.copy(matchVersion, dictionary);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      if (!keywordAtt.isKeyword()) { // don't muck with already-keyworded terms
        String stem = dictionary.get(termAtt.termBuffer(), 0, termAtt.termLength());
        if (stem != null) {
          termAtt.setTermBuffer(stem);
          keywordAtt.setKeyword(true);
        }
      }
      return true;
    } else {
      return false;
    }
  }
}
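
To make the new filter concrete, a hedged wiring sketch (not part of the commit; reader, the override pair, and the English stemmer choice are placeholder assumptions):

// Hypothetical usage: force "dogs" -> "dog" via the dictionary, then let Snowball
// handle everything the dictionary does not cover. Overridden terms are keyword-marked,
// so the downstream SnowballFilter leaves them alone.
Map<String,String> overrides = new HashMap<String,String>();
overrides.put("dogs", "dog");
TokenStream chain = new StandardTokenizer(Version.LUCENE_31, reader);
chain = new StemmerOverrideFilter(Version.LUCENE_31, chain, overrides);
chain = new SnowballFilter(chain, new org.tartarus.snowball.ext.EnglishStemmer());
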
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.ngram;

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

@@ -17,7 +17,6 @@ package org.apache.lucene.analysis.ngram;
 * limitations under the License.
 */

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

@@ -20,11 +20,14 @@ package org.apache.lucene.analysis.nl;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc

@@ -33,7 +36,6 @@ import org.apache.lucene.util.Version;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;

@@ -51,6 +53,17 @@ import java.util.Map;
 * exclusion list is empty by default.
 * </p>
 *
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating DutchAnalyzer:
 * <ul>
 *   <li> As of 3.1, Snowball stemming is done with SnowballFilter,
 *        LowerCaseFilter is used prior to StopFilter, and Snowball
 *        stopwords are used by default.
 *   <li> As of 2.9, StopFilter preserves position
 *        increments
 * </ul>
 *
 * <p><b>NOTE</b>: This class uses the same {@link Version}
 * dependent settings as {@link StandardAnalyzer}.</p>
 */

@@ -60,19 +73,11 @@ public final class DutchAnalyzer extends ReusableAnalyzerBase {
   * @deprecated use {@link #getDefaultStopSet()} instead
   */
  @Deprecated
  public final static String[] DUTCH_STOP_WORDS =
  {
    "de", "en", "van", "ik", "te", "dat", "die", "in", "een",
    "hij", "het", "niet", "zijn", "is", "was", "op", "aan", "met", "als", "voor", "had",
    "er", "maar", "om", "hem", "dan", "zou", "of", "wat", "mijn", "men", "dit", "zo",
    "door", "over", "ze", "zich", "bij", "ook", "tot", "je", "mij", "uit", "der", "daar",
    "haar", "naar", "heb", "hoe", "heeft", "hebben", "deze", "u", "want", "nog", "zal",
    "me", "zij", "nu", "ge", "geen", "omdat", "iets", "worden", "toch", "al", "waren",
    "veel", "meer", "doen", "toen", "moet", "ben", "zonder", "kan", "hun", "dus",
    "alles", "onder", "ja", "eens", "hier", "wie", "werd", "altijd", "doch", "wordt",
    "wezen", "kunnen", "ons", "zelf", "tegen", "na", "reeds", "wil", "kon", "niets",
    "uw", "iemand", "geweest", "andere"
  };
  public final static String[] DUTCH_STOP_WORDS = getDefaultStopSet().toArray(new String[0]);

  /** File containing default Dutch stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "dutch_stop.txt";

  /**
   * Returns an unmodifiable instance of the default stop-words set.
   * @return an unmodifiable instance of the default stop-words set.

@@ -82,9 +87,18 @@ public final class DutchAnalyzer extends ReusableAnalyzerBase {
  }

  private static class DefaultSetHolder {
    static final Set<?> DEFAULT_STOP_SET = CharArraySet
        .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
            Arrays.asList(DUTCH_STOP_WORDS), false));
    static final Set<?> DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
            DEFAULT_STOPWORD_FILE);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set");
      }
    }
  }

@@ -223,18 +237,32 @@ public final class DutchAnalyzer extends ReusableAnalyzerBase {
   * text in the provided {@link Reader}.
   *
   * @return A {@link TokenStream} built from a {@link StandardTokenizer}
   *         filtered with {@link StandardFilter}, {@link StopFilter},
   *         and {@link DutchStemFilter}
   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided,
   *         {@link StemmerOverrideFilter}, and {@link SnowballFilter}
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader aReader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
    TokenStream result = new StandardFilter(source);
    result = new StopFilter(matchVersion, result, stoptable);
    if (!excltable.isEmpty())
      result = new KeywordMarkerTokenFilter(result, excltable);
    result = new DutchStemFilter(result, stemdict);
    return new TokenStreamComponents(source, result);
    if (matchVersion.onOrAfter(Version.LUCENE_31)) {
      final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
      TokenStream result = new StandardFilter(source);
      result = new LowerCaseFilter(matchVersion, result);
      result = new StopFilter(matchVersion, result, stoptable);
      if (!excltable.isEmpty())
        result = new KeywordMarkerTokenFilter(result, excltable);
      if (!stemdict.isEmpty())
        result = new StemmerOverrideFilter(matchVersion, result, stemdict);
      result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
      return new TokenStreamComponents(source, result);
    } else {
      final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
      TokenStream result = new StandardFilter(source);
      result = new StopFilter(matchVersion, result, stoptable);
      if (!excltable.isEmpty())
        result = new KeywordMarkerTokenFilter(result, excltable);
      result = new DutchStemFilter(result, stemdict);
      return new TokenStreamComponents(source, result);
    }
  }
}
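
For contrast between the two paths selected by the Version gate above, a hedged one-liner each (assumes the pre-existing DutchAnalyzer(Version) constructor, which is not shown in this hunk):

// Hypothetical: LUCENE_31 selects the Snowball chain with StemmerOverrideFilter;
// older versions keep the deprecated DutchStemFilter chain for index compatibility.
Analyzer dutch31 = new DutchAnalyzer(Version.LUCENE_31);
Analyzer dutch30 = new DutchAnalyzer(Version.LUCENE_30);
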
@@ -26,6 +26,7 @@ import java.util.Set;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter; // for javadoc
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

@@ -42,7 +43,11 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 * @see KeywordMarkerTokenFilter
 * @deprecated Use {@link SnowballFilter} with
 * {@link org.tartarus.snowball.ext.DutchStemmer} instead, which has the
 * same functionality. This filter will be removed in Lucene 4.0
 */
@Deprecated
public final class DutchStemFilter extends TokenFilter {
  /**
   * The actual token in the input stream.

@@ -26,8 +26,10 @@ import java.util.Map;
 * the <a href="http://snowball.tartarus.org/algorithms/dutch/stemmer.html">dutch stemming</a>
 * algorithm in Martin Porter's snowball project.
 * </p>
 * @deprecated Use {@link org.tartarus.snowball.ext.DutchStemmer} instead,
 * which has the same functionality. This filter will be removed in Lucene 4.0
 */

@Deprecated
public class DutchStemmer {
  /**
   * Buffer for the terms while stemming them.
@@ -0,0 +1,130 @@
package org.apache.lucene.analysis.no;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.NorwegianStemmer;

/**
 * {@link Analyzer} for Norwegian.
 */
public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
  private final Set<?> stemExclusionSet;

  /** File containing default Norwegian stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "norwegian_stop.txt";

  /**
   * Returns an unmodifiable instance of the default stop words set.
   * @return default stop words set.
   */
  public static Set<?> getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
   * accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final Set<?> DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
            DEFAULT_STOPWORD_FILE);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set");
      }
    }
  }

  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   */
  public NorwegianAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   */
  public NorwegianAnalyzer(Version matchVersion, Set<?> stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   * @param stemExclusionSet a set of terms not to be stemmed
   */
  public NorwegianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  }

  /**
   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
   * {@link Reader}.
   *
   * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
   *         exclusion set is provided and {@link SnowballFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty())
      result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new NorwegianStemmer());
    return new TokenStreamComponents(source, result);
  }
}
@@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Norwegian.
</body>
</html>

@@ -17,7 +17,6 @@ package org.apache.lucene.analysis.payloads;
 */


import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;

@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.payloads;

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.payloads;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.Payload;

@@ -21,7 +21,6 @@ import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

/** Set the positionIncrement of all tokens to the "positionIncrement",
@@ -0,0 +1,129 @@
package org.apache.lucene.analysis.pt;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.PortugueseStemmer;

/**
 * {@link Analyzer} for Portuguese.
 */
public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
  private final Set<?> stemExclusionSet;

  /** File containing default Portuguese stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "portuguese_stop.txt";

  /**
   * Returns an unmodifiable instance of the default stop words set.
   * @return default stop words set.
   */
  public static Set<?> getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
   * accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final Set<?> DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
            DEFAULT_STOPWORD_FILE);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set");
      }
    }
  }

  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   */
  public PortugueseAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   */
  public PortugueseAnalyzer(Version matchVersion, Set<?> stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   * @param stemExclusionSet a set of terms not to be stemmed
   */
  public PortugueseAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  }

  /**
   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
   * {@link Reader}.
   *
   * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
   *         exclusion set is provided and {@link SnowballFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty())
      result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new PortugueseStemmer());
    return new TokenStreamComponents(source, result);
  }
}
@@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Portuguese.
</body>
</html>
@@ -0,0 +1,133 @@
package org.apache.lucene.analysis.ro;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.RomanianStemmer;

/**
 * {@link Analyzer} for Romanian.
 */
public final class RomanianAnalyzer extends StopwordAnalyzerBase {
  private final Set<?> stemExclusionSet;

  /** File containing default Romanian stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
  /**
   * The comment character in the stopwords file.
   * All lines prefixed with this will be ignored.
   */
  private static final String STOPWORDS_COMMENT = "#";

  /**
   * Returns an unmodifiable instance of the default stop words set.
   * @return default stop words set.
   */
  public static Set<?> getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
   * accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final Set<?> DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET = loadStopwordSet(false, RomanianAnalyzer.class,
            DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set");
      }
    }
  }

  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   */
  public RomanianAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   */
  public RomanianAnalyzer(Version matchVersion, Set<?> stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   * @param stemExclusionSet a set of terms not to be stemmed
   */
  public RomanianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  }

  /**
   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
   * {@link Reader}.
   *
   * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
   *         exclusion set is provided and {@link SnowballFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty())
      result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new RomanianStemmer());
    return new TokenStreamComponents(source, result);
  }
}
@@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Romanian.
</body>
</html>
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.ru;
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Map;

@@ -26,11 +27,15 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.util.Version;

/**

@@ -40,13 +45,22 @@ import org.apache.lucene.util.Version;
 * will not be indexed at all).
 * A default set of stopwords is used unless an alternative list is specified.
 * </p>
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating RussianAnalyzer:
 * <ul>
 *   <li> As of 3.1, StandardTokenizer is used, Snowball stemming is done with
 *        SnowballFilter, and Snowball stopwords are used by default.
 * </ul>
 */
public final class RussianAnalyzer extends StopwordAnalyzerBase
{
  /**
   * List of typical Russian stopwords.
   * List of typical Russian stopwords. (for backwards compatibility)
   * @deprecated Remove this for LUCENE 4.0
   */
  private static final String[] RUSSIAN_STOP_WORDS = {
  @Deprecated
  private static final String[] RUSSIAN_STOP_WORDS_30 = {
    "а", "без", "более", "бы", "был", "была", "были", "было", "быть", "в",
    "вам", "вас", "весь", "во", "вот", "все", "всего", "всех", "вы", "где",
    "да", "даже", "для", "до", "его", "ее", "ей", "ею", "если", "есть",

@@ -59,10 +73,27 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
    "чем", "что", "чтобы", "чье", "чья", "эта", "эти", "это", "я"
  };

  /** File containing default Russian stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "russian_stop.txt";

  private static class DefaultSetHolder {
    static final Set<?> DEFAULT_STOP_SET = CharArraySet
    /** @deprecated remove this for Lucene 4.0 */
    @Deprecated
    static final Set<?> DEFAULT_STOP_SET_30 = CharArraySet
        .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
            Arrays.asList(RUSSIAN_STOP_WORDS), false));
            Arrays.asList(RUSSIAN_STOP_WORDS_30), false));
    static final Set<?> DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET =
            WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set");
      }
    }
  }

  private final Set<?> stemExclusionSet;

@@ -77,7 +108,9 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
  }

  public RussianAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
    this(matchVersion,
        matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_STOP_SET
            : DefaultSetHolder.DEFAULT_STOP_SET_30);
  }

  /**
@@ -132,19 +165,30 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
   * provided {@link Reader}.
   *
   * @return {@link TokenStreamComponents} built from a
   *         {@link RussianLetterTokenizer} filtered with
   *         {@link StandardTokenizer} filtered with {@link StandardFilter},
   *         {@link LowerCaseFilter}, {@link StopFilter},
   *         and {@link RussianStemFilter}
   *         {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided,
   *         and {@link SnowballFilter}
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new RussianLetterTokenizer(matchVersion, reader);
    TokenStream result = new LowerCaseFilter(matchVersion, source);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty())
      result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
    return new TokenStreamComponents(source, new RussianStemFilter(result));

    if (matchVersion.onOrAfter(Version.LUCENE_31)) {
      final Tokenizer source = new StandardTokenizer(matchVersion, reader);
      TokenStream result = new StandardFilter(source);
      result = new LowerCaseFilter(matchVersion, result);
      result = new StopFilter(matchVersion, result, stopwords);
      if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerTokenFilter(
          result, stemExclusionSet);
      result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
      return new TokenStreamComponents(source, result);
    } else {
      final Tokenizer source = new RussianLetterTokenizer(matchVersion, reader);
      TokenStream result = new LowerCaseFilter(matchVersion, source);
      result = new StopFilter(matchVersion, result, stopwords);
      if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerTokenFilter(
          result, stemExclusionSet);
      return new TokenStreamComponents(source, new RussianStemFilter(result));
    }
  }
}
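
The Version gate above effectively hides two different analyzers behind one class; a brief hedged sketch of selecting each path:

// Hypothetical: 3.1 gets StandardTokenizer + SnowballFilter and Snowball stopwords;
// earlier versions keep RussianLetterTokenizer + RussianStemFilter and the old list.
Analyzer russian31 = new RussianAnalyzer(Version.LUCENE_31);
Analyzer russian30 = new RussianAnalyzer(Version.LUCENE_30);
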
@@ -21,6 +21,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.CharTokenizer;
import org.apache.lucene.analysis.Tokenizer; // for javadocs
import org.apache.lucene.analysis.LetterTokenizer; // for javadocs
import org.apache.lucene.analysis.standard.StandardTokenizer; // for javadocs
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;

@@ -35,8 +36,11 @@ import org.apache.lucene.util.Version;
 * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
 * detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
 * {@link CharTokenizer#normalize(int)} for details.</li>
 * </ul>
 * </ul>
 * @deprecated Use {@link StandardTokenizer} instead, which has the same functionality.
 * This filter will be removed in Lucene 4.0
 */
@Deprecated
public class RussianLetterTokenizer extends CharTokenizer
{
  private static final int DIGIT_0 = '0';

@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.ru.RussianStemmer; // javadoc @link
import org.apache.lucene.analysis.snowball.SnowballFilter; // javadoc @link

import java.io.IOException;

@@ -40,7 +41,11 @@ import java.io.IOException;
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 * @see KeywordMarkerTokenFilter
 * @deprecated Use {@link SnowballFilter} with
 * {@link org.tartarus.snowball.ext.RussianStemmer} instead, which has the
 * same functionality. This filter will be removed in Lucene 4.0
 */
@Deprecated
public final class RussianStemFilter extends TokenFilter
{
  /**

@@ -19,7 +19,10 @@ package org.apache.lucene.analysis.ru;

/**
 * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
 * @deprecated Use {@link org.tartarus.snowball.ext.RussianStemmer} instead,
 * which has the same functionality. This filter will be removed in Lucene 4.0
 */
@Deprecated
class RussianStemmer
{
  // positions of RV, R1 and R2 respectively

@@ -39,7 +39,10 @@ import java.util.Set;
 * <li> As of 3.1, uses {@link TurkishLowerCaseFilter} for Turkish language.
 * </ul>
 * </p>
 * @deprecated Use the language-specific analyzer in contrib/analyzers instead.
 * This analyzer will be removed in Lucene 4.0
 */
@Deprecated
public final class SnowballAnalyzer extends Analyzer {
  private String name;
  private Set<?> stopSet;

@@ -21,6 +21,7 @@ import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; // javadoc @link
import org.apache.lucene.analysis.LowerCaseFilter; // javadoc @link

@@ -39,14 +40,14 @@ import org.tartarus.snowball.SnowballProgram;
 */
public final class SnowballFilter extends TokenFilter {

  private SnowballProgram stemmer;
  private final SnowballProgram stemmer;

  private TermAttribute termAtt;
  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  public SnowballFilter(TokenStream input, SnowballProgram stemmer) {
    super(input);
    this.stemmer = stemmer;
    termAtt = addAttribute(TermAttribute.class);
  }

  /**
@@ -67,23 +68,24 @@ public final class SnowballFilter extends TokenFilter {
    } catch (Exception e) {
      throw new RuntimeException(e.toString());
    }
    termAtt = addAttribute(TermAttribute.class);
  }

  /** Returns the next input Token, after being stemmed */
  @Override
  public final boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      char termBuffer[] = termAtt.termBuffer();
      final int length = termAtt.termLength();
      stemmer.setCurrent(termBuffer, length);
      stemmer.stem();
      final char finalTerm[] = stemmer.getCurrentBuffer();
      final int newLength = stemmer.getCurrentBufferLength();
      if (finalTerm != termBuffer)
        termAtt.setTermBuffer(finalTerm, 0, newLength);
      else
        termAtt.setTermLength(newLength);
      if (!keywordAttr.isKeyword()) {
        char termBuffer[] = termAtt.termBuffer();
        final int length = termAtt.termLength();
        stemmer.setCurrent(termBuffer, length);
        stemmer.stem();
        final char finalTerm[] = stemmer.getCurrentBuffer();
        final int newLength = stemmer.getCurrentBufferLength();
        if (finalTerm != termBuffer)
          termAtt.setTermBuffer(finalTerm, 0, newLength);
        else
          termAtt.setTermLength(newLength);
      }
      return true;
    } else {
      return false;
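
With the keyword-aware change above, anything that sets KeywordAttribute upstream now survives Snowball unstemmed. A minimal hedged sketch (reader, the protected term, and the English stemmer choice are placeholder assumptions):

// Hypothetical chain: "lucene" is keyword-marked and passes SnowballFilter unchanged.
Set<String> protectedTerms = new HashSet<String>(Arrays.asList("lucene"));
TokenStream stream = new StandardTokenizer(Version.LUCENE_31, reader);
stream = new KeywordMarkerTokenFilter(stream, protectedTerms);
stream = new SnowballFilter(stream, new org.tartarus.snowball.ext.EnglishStemmer());
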
@@ -0,0 +1,129 @@
package org.apache.lucene.analysis.sv;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.SwedishStemmer;

/**
 * {@link Analyzer} for Swedish.
 */
public final class SwedishAnalyzer extends StopwordAnalyzerBase {
  private final Set<?> stemExclusionSet;

  /** File containing default Swedish stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "swedish_stop.txt";

  /**
   * Returns an unmodifiable instance of the default stop words set.
   * @return default stop words set.
   */
  public static Set<?> getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
   * accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final Set<?> DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
            DEFAULT_STOPWORD_FILE);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set");
      }
    }
  }

  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   */
  public SwedishAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   */
  public SwedishAnalyzer(Version matchVersion, Set<?> stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   * @param stemExclusionSet a set of terms not to be stemmed
   */
  public SwedishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  }

  /**
   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
   * {@link Reader}.
   *
   * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
   *         exclusion set is provided and {@link SnowballFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty())
      result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new SwedishStemmer());
    return new TokenStreamComponents(source, result);
  }
}
@@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Swedish.
</body>
</html>
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.th;
import java.io.IOException;
import java.util.Locale;
import java.lang.Character.UnicodeBlock;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@ -0,0 +1,132 @@
|
|||
package org.apache.lucene.analysis.tr;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.TurkishStemmer;

/**
 * {@link Analyzer} for Turkish.
 */
public final class TurkishAnalyzer extends StopwordAnalyzerBase {
  private final Set<?> stemExclusionSet;

  /** File containing default Turkish stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
  /**
   * The comment character in the stopwords file.
   * All lines prefixed with this will be ignored.
   */
  private static final String STOPWORDS_COMMENT = "#";

  /**
   * Returns an unmodifiable instance of the default stop words set.
   * @return default stop words set.
   */
  public static Set<?> getDefaultStopSet(){
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
   * accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final Set<?> DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET = loadStopwordSet(false, TurkishAnalyzer.class,
            DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set");
      }
    }
  }

  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   */
  public TurkishAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   */
  public TurkishAnalyzer(Version matchVersion, Set<?> stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided, this analyzer will add a {@link KeywordMarkerTokenFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   * @param stemExclusionSet a set of terms not to be stemmed
   */
  public TurkishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  }

  /**
   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
   * {@link Reader}.
   *
   * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
   *         filtered with {@link StandardFilter}, {@link TurkishLowerCaseFilter},
   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
   *         exclusion set is provided, and {@link SnowballFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(source);
    result = new TurkishLowerCaseFilter(result);
    result = new StopFilter(matchVersion, result, stopwords);
    if(!stemExclusionSet.isEmpty())
      result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new TurkishStemmer());
    return new TokenStreamComponents(source, result);
  }
}
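
A minimal usage sketch for the analyzer above (the field name and sample text are hypothetical; TermAttribute is the 3.x-era accessor for the term text):

// Sketch: analyze a sample string with TurkishAnalyzer and print each term.
Analyzer analyzer = new TurkishAnalyzer(Version.LUCENE_CURRENT);
TokenStream ts = analyzer.tokenStream("content", new StringReader("ağacı dolayı ağaç"));
TermAttribute term = ts.addAttribute(TermAttribute.class);
while (ts.incrementToken()) {
  System.out.println(term.term()); // expected: "ağaç" twice; the stopword "dolayı" is removed
}
ts.close();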
@@ -17,15 +17,6 @@
-->
<html><head></head>
<body>
Support for Turkish.
<p>
This package contains just the TokenStream for handling Turkish casing;
for a stemmer, please see the snowball package.
</p>
<p>
WARNING: SnowballAnalyzer uses LowerCaseFilter by default, even when the
language is set to Turkish, so you will need to construct your own
analyzer that combines TurkishLowerCaseFilter and SnowballFilter.
</p>
Analyzer for Turkish.
</body>
</html>
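
For reference, the hand-built chain that warning describes would look roughly like this sketch (using the classes from this commit; 'matchVersion' and 'reader' are supplied by the caller). The new TurkishAnalyzer encapsulates essentially the same chain, plus stopword removal:

// Sketch of the do-it-yourself chain from the warning above.
TokenStream stream = new StandardTokenizer(matchVersion, reader);
stream = new StandardFilter(stream);
stream = new TurkishLowerCaseFilter(stream); // not LowerCaseFilter: handles dotted/dotless i correctly
stream = new SnowballFilter(stream, new TurkishStemmer());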
@@ -0,0 +1,233 @@
# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
acea
aceasta
această
aceea
acei
aceia
acel
acela
acele
acelea
acest
acesta
aceste
acestea
aceşti
aceştia
acolo
acum
ai
aia
aibă
aici
al
ăla
ale
alea
ălea
altceva
altcineva
am
ar
are
aş
aşadar
asemenea
asta
ăsta
astăzi
astea
ăstea
ăştia
asupra
aţi
au
avea
avem
aveţi
azi
bine
bucur
bună
ca
că
căci
când
care
cărei
căror
cărui
cât
câte
câţi
către
câtva
ce
cel
ceva
chiar
cînd
cine
cineva
cît
cîte
cîţi
cîtva
contra
cu
cum
cumva
curând
curînd
da
dă
dacă
dar
datorită
de
deci
deja
deoarece
departe
deşi
din
dinaintea
dintr
dintre
drept
după
ea
ei
el
ele
eram
este
eşti
eu
face
fără
fi
fie
fiecare
fii
fim
fiţi
iar
ieri
îi
îl
îmi
împotriva
în
înainte
înaintea
încât
încît
încotro
între
întrucât
întrucît
îţi
la
lângă
le
li
lîngă
lor
lui
mă
mâine
mea
mei
mele
mereu
meu
mi
mine
mult
multă
mulţi
ne
nicăieri
nici
nimeni
nişte
noastră
noastre
noi
noştri
nostru
nu
ori
oricând
oricare
oricât
orice
oricînd
oricine
oricît
oricum
oriunde
până
pe
pentru
peste
pînă
poate
pot
prea
prima
primul
prin
printr
sa
să
săi
sale
sau
său
se
şi
sînt
sîntem
sînteţi
spre
sub
sunt
suntem
sunteţi
ta
tăi
tale
tău
te
ţi
ţie
tine
toată
toate
tot
toţi
totuşi
tu
un
una
unde
undeva
unei
unele
uneori
unor
vă
vi
voastră
voastre
voi
voştri
vostru
vouă
vreo
vreun

@@ -0,0 +1,212 @@
# Turkish stopwords from LUCENE-559
# merged with the list from "Information Retrieval on Turkish Texts"
# (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf)
acaba
altmış
altı
ama
ancak
arada
aslında
ayrıca
bana
bazı
belki
ben
benden
beni
benim
beri
beş
bile
bin
bir
birçok
biri
birkaç
birkez
birşey
birşeyi
biz
bize
bizden
bizi
bizim
böyle
böylece
bu
buna
bunda
bundan
bunlar
bunları
bunların
bunu
bunun
burada
çok
çünkü
da
daha
dahi
de
defa
değil
diğer
diye
doksan
dokuz
dolayı
dolayısıyla
dört
edecek
eden
ederek
edilecek
ediliyor
edilmesi
ediyor
eğer
elli
en
etmesi
etti
ettiği
ettiğini
gibi
göre
halen
hangi
hatta
hem
henüz
hep
hepsi
her
herhangi
herkesin
hiç
hiçbir
için
iki
ile
ilgili
ise
işte
itibaren
itibariyle
kadar
karşın
katrilyon
kendi
kendilerine
kendini
kendisi
kendisine
kendisini
kez
ki
kim
kimden
kime
kimi
kimse
kırk
milyar
milyon
mu
mü
mı
nasıl
ne
neden
nedenle
nerde
nerede
nereye
niye
niçin
o
olan
olarak
oldu
olduğu
olduğunu
olduklarını
olmadı
olmadığı
olmak
olması
olmayan
olmaz
olsa
olsun
olup
olur
olursa
oluyor
on
ona
ondan
onlar
onlardan
onları
onların
onu
onun
otuz
oysa
öyle
pek
rağmen
sadece
sanki
sekiz
seksen
sen
senden
seni
senin
siz
sizden
sizi
sizin
şey
şeyden
şeyi
şeyler
şöyle
şu
şuna
şunda
şundan
şunları
şunu
tarafından
trilyon
tüm
üç
üzere
var
vardı
ve
veya
ya
yani
yapacak
yapılan
yapılması
yapıyor
yapmak
yaptı
yaptığı
yaptığını
yaptıkları
yedi
yerine
yetmiş
yine
yirmi
yoksa
yüz
zaten
@@ -22,8 +22,6 @@ import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import javax.print.DocFlavor.CHAR_ARRAY;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;

@@ -21,7 +21,6 @@ import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

/**
@@ -0,0 +1,54 @@
package org.apache.lucene.analysis.da;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;

public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
  /** This test fails with NPE when the
   * stopwords file is missing in classpath */
  public void testResourcesAvailable() {
    new DanishAnalyzer(Version.LUCENE_CURRENT);
  }

  /** test stopwords and stemming */
  public void testBasics() throws IOException {
    Analyzer a = new DanishAnalyzer(Version.LUCENE_CURRENT);
    // stemming
    checkOneTermReuse(a, "undersøg", "undersøg");
    checkOneTermReuse(a, "undersøgelse", "undersøg");
    // stopword
    assertAnalyzesTo(a, "på", new String[] {});
  }

  /** test use of exclusion set */
  public void testExclude() throws IOException {
    Set<String> exclusionSet = new HashSet<String>();
    exclusionSet.add("undersøgelse");
    Analyzer a = new DanishAnalyzer(Version.LUCENE_CURRENT,
        DanishAnalyzer.getDefaultStopSet(), exclusionSet);
    checkOneTermReuse(a, "undersøgelse", "undersøgelse");
    checkOneTermReuse(a, "undersøg", "undersøg");
  }
}
@@ -0,0 +1,93 @@
package org.apache.lucene.analysis.de;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.util.Version;

public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
  public void testReusableTokenStream() throws Exception {
    Analyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
    checkOneTermReuse(a, "Tisch", "tisch");
    checkOneTermReuse(a, "Tische", "tisch");
    checkOneTermReuse(a, "Tischen", "tisch");
  }

  public void testExclusionTableBWCompat() throws IOException {
    GermanStemFilter filter = new GermanStemFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT,
        new StringReader("Fischen Trinken")));
    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
    set.add("fischen");
    filter.setExclusionSet(set);
    assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
  }

  public void testWithKeywordAttribute() throws IOException {
    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
    set.add("fischen");
    GermanStemFilter filter = new GermanStemFilter(
        new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
            "Fischen Trinken")), set));
    assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
  }

  public void testWithKeywordAttributeAndExclusionTable() throws IOException {
    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
    set.add("fischen");
    CharArraySet set1 = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
    set1.add("trinken");
    set1.add("fischen");
    GermanStemFilter filter = new GermanStemFilter(
        new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
            "Fischen Trinken")), set));
    filter.setExclusionSet(set1);
    assertTokenStreamContents(filter, new String[] { "fischen", "trinken" });
  }

  /*
   * Test that changes to the exclusion table are applied immediately
   * when using reusable token streams.
   */
  public void testExclusionTableReuse() throws Exception {
    GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
    checkOneTermReuse(a, "tischen", "tisch");
    a.setStemExclusionTable(new String[] { "tischen" });
    checkOneTermReuse(a, "tischen", "tischen");
  }

  /** test some features of the new snowball filter
   * these only pass with LUCENE_CURRENT, not if you use o.a.l.a.de.GermanStemmer
   */
  public void testGermanSpecials() throws Exception {
    GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
    // a/o/u + e is equivalent to the umlaut form
    checkOneTermReuse(a, "Schaltflächen", "schaltflach");
    checkOneTermReuse(a, "Schaltflaechen", "schaltflach");
    // here they are with the old stemmer
    a = new GermanAnalyzer(Version.LUCENE_30);
    checkOneTermReuse(a, "Schaltflächen", "schaltflach");
    checkOneTermReuse(a, "Schaltflaechen", "schaltflaech");
  }
}
@@ -20,15 +20,14 @@ package org.apache.lucene.analysis.de;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Version;

/**

@@ -40,6 +39,8 @@ import org.apache.lucene.util.Version;
public class TestGermanStemFilter extends BaseTokenStreamTestCase {

  public void testStemming() throws Exception {
    Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
    TokenFilter filter = new GermanStemFilter(new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer));
    // read test cases from external file:
    File dataDir = new File(System.getProperty("dataDir", "./bin"));
    File testFile = new File(dataDir, "org/apache/lucene/analysis/de/data.txt");

@@ -55,68 +56,12 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
        continue; // ignore comments and empty lines
      String[] parts = line.split(";");
      //System.out.println(parts[0] + " -- " + parts[1]);
      check(parts[0], parts[1]);
      tokenizer.reset(new StringReader(parts[0]));
      filter.reset();
      assertTokenStreamContents(filter, new String[] { parts[1] });
    }
    breader.close();
    isr.close();
    fis.close();
  }

  public void testReusableTokenStream() throws Exception {
    Analyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
    checkReuse(a, "Tisch", "tisch");
    checkReuse(a, "Tische", "tisch");
    checkReuse(a, "Tischen", "tisch");
  }

  public void testExclusionTableBWCompat() throws IOException {
    GermanStemFilter filter = new GermanStemFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT,
        new StringReader("Fischen Trinken")));
    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
    set.add("fischen");
    filter.setExclusionSet(set);
    assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
  }

  public void testWithKeywordAttribute() throws IOException {
    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
    set.add("fischen");
    GermanStemFilter filter = new GermanStemFilter(
        new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
            "Fischen Trinken")), set));
    assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
  }

  public void testWithKeywordAttributeAndExclusionTable() throws IOException {
    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
    set.add("fischen");
    CharArraySet set1 = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
    set1.add("trinken");
    set1.add("fischen");
    GermanStemFilter filter = new GermanStemFilter(
        new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
            "Fischen Trinken")), set));
    filter.setExclusionSet(set1);
    assertTokenStreamContents(filter, new String[] { "fischen", "trinken" });
  }

  /*
   * Test that changes to the exclusion table are applied immediately
   * when using reusable token streams.
   */
  public void testExclusionTableReuse() throws Exception {
    GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
    checkReuse(a, "tischen", "tisch");
    a.setStemExclusionTable(new String[] { "tischen" });
    checkReuse(a, "tischen", "tischen");
  }


  private void check(final String input, final String expected) throws Exception {
    checkOneTerm(new GermanAnalyzer(Version.LUCENE_CURRENT), input, expected);
  }

  private void checkReuse(Analyzer a, String input, String expected) throws Exception {
    checkOneTermReuse(a, input, expected);
  }
}
@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.el;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;

/**

@@ -63,4 +62,23 @@ public class GreekAnalyzerTest extends BaseTokenStreamTestCase {
    assertAnalyzesToReuse(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3 \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9",
        new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" });
  }

  /**
   * Greek Analyzer didn't call standardFilter, so no normalization of acronyms.
   * Check that this is preserved.
   * @deprecated remove this test in Lucene 4.0
   */
  @Deprecated
  public void testAcronymBWCompat() throws Exception {
    Analyzer a = new GreekAnalyzer(Version.LUCENE_30);
    assertAnalyzesTo(a, "Α.Π.Τ.", new String[] { "α.π.τ." });
  }

  /**
   * test that acronym normalization works
   */
  public void testAcronym() throws Exception {
    Analyzer a = new GreekAnalyzer(Version.LUCENE_31);
    assertAnalyzesTo(a, "Α.Π.Τ.", new String[] { "απτ" });
  }
}
@@ -0,0 +1,54 @@
package org.apache.lucene.analysis.en;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;

public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
  /** This test fails with NPE when the
   * stopwords file is missing in classpath */
  public void testResourcesAvailable() {
    new EnglishAnalyzer(Version.LUCENE_CURRENT);
  }

  /** test stopwords and stemming */
  public void testBasics() throws IOException {
    Analyzer a = new EnglishAnalyzer(Version.LUCENE_CURRENT);
    // stemming
    checkOneTermReuse(a, "books", "book");
    checkOneTermReuse(a, "book", "book");
    // stopword
    assertAnalyzesTo(a, "the", new String[] {});
  }

  /** test use of exclusion set */
  public void testExclude() throws IOException {
    Set<String> exclusionSet = new HashSet<String>();
    exclusionSet.add("books");
    Analyzer a = new EnglishAnalyzer(Version.LUCENE_CURRENT,
        EnglishAnalyzer.getDefaultStopSet(), exclusionSet);
    checkOneTermReuse(a, "books", "books");
    checkOneTermReuse(a, "book", "book");
  }
}

@@ -0,0 +1,54 @@
package org.apache.lucene.analysis.es;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;

public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
  /** This test fails with NPE when the
   * stopwords file is missing in classpath */
  public void testResourcesAvailable() {
    new SpanishAnalyzer(Version.LUCENE_CURRENT);
  }

  /** test stopwords and stemming */
  public void testBasics() throws IOException {
    Analyzer a = new SpanishAnalyzer(Version.LUCENE_CURRENT);
    // stemming
    checkOneTermReuse(a, "chicana", "chican");
    checkOneTermReuse(a, "chicano", "chican");
    // stopword
    assertAnalyzesTo(a, "los", new String[] {});
  }

  /** test use of exclusion set */
  public void testExclude() throws IOException {
    Set<String> exclusionSet = new HashSet<String>();
    exclusionSet.add("chicano");
    Analyzer a = new SpanishAnalyzer(Version.LUCENE_CURRENT,
        SpanishAnalyzer.getDefaultStopSet(), exclusionSet);
    checkOneTermReuse(a, "chicana", "chican");
    checkOneTermReuse(a, "chicano", "chicano");
  }
}

@@ -0,0 +1,54 @@
package org.apache.lucene.analysis.fi;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;

public class TestFinnishAnalyzer extends BaseTokenStreamTestCase {
  /** This test fails with NPE when the
   * stopwords file is missing in classpath */
  public void testResourcesAvailable() {
    new FinnishAnalyzer(Version.LUCENE_CURRENT);
  }

  /** test stopwords and stemming */
  public void testBasics() throws IOException {
    Analyzer a = new FinnishAnalyzer(Version.LUCENE_CURRENT);
    // stemming
    checkOneTermReuse(a, "edeltäjiinsä", "edeltäj");
    checkOneTermReuse(a, "edeltäjistään", "edeltäj");
    // stopword
    assertAnalyzesTo(a, "olla", new String[] {});
  }

  /** test use of exclusion set */
  public void testExclude() throws IOException {
    Set<String> exclusionSet = new HashSet<String>();
    exclusionSet.add("edeltäjistään");
    Analyzer a = new FinnishAnalyzer(Version.LUCENE_CURRENT,
        FinnishAnalyzer.getDefaultStopSet(), exclusionSet);
    checkOneTermReuse(a, "edeltäjiinsä", "edeltäj");
    checkOneTermReuse(a, "edeltäjistään", "edeltäjistään");
  }
}
@@ -17,6 +17,8 @@ package org.apache.lucene.analysis.fr;
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;
@@ -113,6 +115,94 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {

  }

  /**
   * @deprecated remove this test for Lucene 4.0
   */
  @Deprecated
  public void testAnalyzer30() throws Exception {
    FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_30);

    assertAnalyzesTo(fa, "", new String[] {
    });

    assertAnalyzesTo(
        fa,
        "chien chat cheval",
        new String[] { "chien", "chat", "cheval" });

    assertAnalyzesTo(
        fa,
        "chien CHAT CHEVAL",
        new String[] { "chien", "chat", "cheval" });

    assertAnalyzesTo(
        fa,
        " chien ,? + = - CHAT /: > CHEVAL",
        new String[] { "chien", "chat", "cheval" });

    assertAnalyzesTo(fa, "chien++", new String[] { "chien" });

    assertAnalyzesTo(
        fa,
        "mot \"entreguillemet\"",
        new String[] { "mot", "entreguillemet" });

    // let's do some French-specific tests now

    /* 1. couldn't resist:
       I would expect this to stay one term, as in French the minus
       sign is often used for composing words */
    assertAnalyzesTo(
        fa,
        "Jean-François",
        new String[] { "jean", "françois" });

    // 2. stopwords
    assertAnalyzesTo(
        fa,
        "le la chien les aux chat du des à cheval",
        new String[] { "chien", "chat", "cheval" });

    // some nouns and adjectives
    assertAnalyzesTo(
        fa,
        "lances chismes habitable chiste éléments captifs",
        new String[] {
            "lanc",
            "chism",
            "habit",
            "chist",
            "élément",
            "captif" });

    // some verbs
    assertAnalyzesTo(
        fa,
        "finissions souffrirent rugissante",
        new String[] { "fin", "souffr", "rug" });

    // some everything else
    // aujourd'hui stays one term, which is OK
    assertAnalyzesTo(
        fa,
        "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
        new String[] {
            "c3po",
            "aujourd'hui",
            "oeuf",
            "ïâöûàä",
            "anticonstitutionnel",
            "jav" });

    // some more everything else
    // here 1940-1945 stays as one term, 1940:1945 does not
    assertAnalyzesTo(
        fa,
        "33Bis 1940-1945 1940:1945 (---i+++)*",
        new String[] { "33bis", "1940-1945", "1940", "1945", "i" });

  }

  public void testReusableTokenStream() throws Exception {
    FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT);
    // stopwords

@@ -157,4 +247,28 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
    assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable",
        "chist" });
  }

  public void testElision() throws Exception {
    FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT);
    assertAnalyzesTo(fa, "voir l'embrouille", new String[] { "voir", "embrouill" });
  }

  /**
   * Prior to 3.1, this analyzer had no lowercase filter;
   * stopwords were case sensitive. Preserve this for back compat.
   * @deprecated Remove this test in Lucene 4.0
   */
  @Deprecated
  public void testBuggyStopwordsCasing() throws IOException {
    FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_30);
    assertAnalyzesTo(a, "Votre", new String[] { "votr" });
  }

  /**
   * Test that stopwords are not case sensitive
   */
  public void testStopwordsCasing() throws IOException {
    FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_31);
    assertAnalyzesTo(a, "Votre", new String[] { });
  }
}
@@ -0,0 +1,54 @@
package org.apache.lucene.analysis.hu;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;

public class TestHungarianAnalyzer extends BaseTokenStreamTestCase {
  /** This test fails with NPE when the
   * stopwords file is missing in classpath */
  public void testResourcesAvailable() {
    new HungarianAnalyzer(Version.LUCENE_CURRENT);
  }

  /** test stopwords and stemming */
  public void testBasics() throws IOException {
    Analyzer a = new HungarianAnalyzer(Version.LUCENE_CURRENT);
    // stemming
    checkOneTermReuse(a, "babakocsi", "babakocs");
    checkOneTermReuse(a, "babakocsijáért", "babakocs");
    // stopword
    assertAnalyzesTo(a, "által", new String[] {});
  }

  /** test use of exclusion set */
  public void testExclude() throws IOException {
    Set<String> exclusionSet = new HashSet<String>();
    exclusionSet.add("babakocsi");
    Analyzer a = new HungarianAnalyzer(Version.LUCENE_CURRENT,
        HungarianAnalyzer.getDefaultStopSet(), exclusionSet);
    checkOneTermReuse(a, "babakocsi", "babakocsi");
    checkOneTermReuse(a, "babakocsijáért", "babakocs");
  }
}

@@ -0,0 +1,54 @@
package org.apache.lucene.analysis.it;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;

public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
  /** This test fails with NPE when the
   * stopwords file is missing in classpath */
  public void testResourcesAvailable() {
    new ItalianAnalyzer(Version.LUCENE_CURRENT);
  }

  /** test stopwords and stemming */
  public void testBasics() throws IOException {
    Analyzer a = new ItalianAnalyzer(Version.LUCENE_CURRENT);
    // stemming
    checkOneTermReuse(a, "abbandonata", "abbandon");
    checkOneTermReuse(a, "abbandonati", "abbandon");
    // stopword
    assertAnalyzesTo(a, "dallo", new String[] {});
  }

  /** test use of exclusion set */
  public void testExclude() throws IOException {
    Set<String> exclusionSet = new HashSet<String>();
    exclusionSet.add("abbandonata");
    Analyzer a = new ItalianAnalyzer(Version.LUCENE_CURRENT,
        ItalianAnalyzer.getDefaultStopSet(), exclusionSet);
    checkOneTermReuse(a, "abbandonata", "abbandonata");
    checkOneTermReuse(a, "abbandonati", "abbandon");
  }
}
@@ -0,0 +1,44 @@
package org.apache.lucene.analysis.miscellaneous;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Version;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase {
  public void testOverride() throws IOException {
    // let's make "booked" stem to "books":
    // the override filter will convert "booked" to "books",
    // but also mark it with KeywordAttribute so Porter will not change it.
    Map<String,String> dictionary = new HashMap<String,String>();
    dictionary.put("booked", "books");
    Tokenizer tokenizer = new KeywordTokenizer(new StringReader("booked"));
    TokenStream stream = new PorterStemFilter(
        new StemmerOverrideFilter(Version.LUCENE_CURRENT, tokenizer, dictionary));
    assertTokenStreamContents(stream, new String[] { "books" });
  }
}
@@ -22,7 +22,6 @@ import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;

import java.io.IOException;
import java.io.StringReader;

/**

@@ -18,10 +18,8 @@ package org.apache.lucene.analysis.ngram;
 */


import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;

/**

@@ -100,9 +100,6 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
    check("ophalend", "ophal");
    check("ophalers", "ophaler");
    check("ophef", "ophef");
    check("opheffen", "ophef"); // versus snowball 'opheff'
    check("opheffende", "ophef"); // versus snowball 'opheff'
    check("opheffing", "ophef"); // versus snowball 'opheff'
    check("opheldering", "ophelder");
    check("ophemelde", "ophemeld");
    check("ophemelen", "ophemel");

@@ -118,6 +115,24 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
    check("ophouden", "ophoud");
  }

  /**
   * @deprecated remove this test in Lucene 4.0
   */
  @Deprecated
  public void testOldBuggyStemmer() throws Exception {
    Analyzer a = new DutchAnalyzer(Version.LUCENE_30);
    checkOneTermReuse(a, "opheffen", "ophef"); // versus snowball 'opheff'
    checkOneTermReuse(a, "opheffende", "ophef"); // versus snowball 'opheff'
    checkOneTermReuse(a, "opheffing", "ophef"); // versus snowball 'opheff'
  }

  public void testSnowballCorrectness() throws Exception {
    Analyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT);
    checkOneTermReuse(a, "opheffen", "opheff");
    checkOneTermReuse(a, "opheffende", "opheff");
    checkOneTermReuse(a, "opheffing", "opheff");
  }

  public void testReusableTokenStream() throws Exception {
    Analyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT);
    checkOneTermReuse(a, "lichaamsziek", "lichaamsziek");
@@ -161,6 +176,25 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
    checkOneTermReuse(a, "lichamelijk", "somethingentirelydifferent");
  }

  /**
   * Prior to 3.1, this analyzer had no lowercase filter;
   * stopwords were case sensitive. Preserve this for back compat.
   * @deprecated Remove this test in Lucene 4.0
   */
  @Deprecated
  public void testBuggyStopwordsCasing() throws IOException {
    DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_30);
    assertAnalyzesTo(a, "Zelf", new String[] { "zelf" });
  }

  /**
   * Test that stopwords are not case sensitive
   */
  public void testStopwordsCasing() throws IOException {
    DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_31);
    assertAnalyzesTo(a, "Zelf", new String[] { });
  }

  private void check(final String input, final String expected) throws Exception {
    checkOneTerm(new DutchAnalyzer(Version.LUCENE_CURRENT), input, expected);
  }
@@ -0,0 +1,54 @@
package org.apache.lucene.analysis.no;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;

public class TestNorwegianAnalyzer extends BaseTokenStreamTestCase {
  /** This test fails with NPE when the
   * stopwords file is missing in classpath */
  public void testResourcesAvailable() {
    new NorwegianAnalyzer(Version.LUCENE_CURRENT);
  }

  /** test stopwords and stemming */
  public void testBasics() throws IOException {
    Analyzer a = new NorwegianAnalyzer(Version.LUCENE_CURRENT);
    // stemming
    checkOneTermReuse(a, "havnedistriktene", "havnedistrikt");
    checkOneTermReuse(a, "havnedistrikter", "havnedistrikt");
    // stopword
    assertAnalyzesTo(a, "det", new String[] {});
  }

  /** test use of exclusion set */
  public void testExclude() throws IOException {
    Set<String> exclusionSet = new HashSet<String>();
    exclusionSet.add("havnedistriktene");
    Analyzer a = new NorwegianAnalyzer(Version.LUCENE_CURRENT,
        NorwegianAnalyzer.getDefaultStopSet(), exclusionSet);
    checkOneTermReuse(a, "havnedistriktene", "havnedistriktene");
    checkOneTermReuse(a, "havnedistrikter", "havnedistrikt");
  }
}

@@ -0,0 +1,54 @@
package org.apache.lucene.analysis.pt;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;

public class TestPortugueseAnalyzer extends BaseTokenStreamTestCase {
  /** This test fails with NPE when the
   * stopwords file is missing in classpath */
  public void testResourcesAvailable() {
    new PortugueseAnalyzer(Version.LUCENE_CURRENT);
  }

  /** test stopwords and stemming */
  public void testBasics() throws IOException {
    Analyzer a = new PortugueseAnalyzer(Version.LUCENE_CURRENT);
    // stemming
    checkOneTermReuse(a, "quilométricas", "quilométr");
    checkOneTermReuse(a, "quilométricos", "quilométr");
    // stopword
    assertAnalyzesTo(a, "não", new String[] {});
  }

  /** test use of exclusion set */
  public void testExclude() throws IOException {
    Set<String> exclusionSet = new HashSet<String>();
    exclusionSet.add("quilométricas");
    Analyzer a = new PortugueseAnalyzer(Version.LUCENE_CURRENT,
        PortugueseAnalyzer.getDefaultStopSet(), exclusionSet);
    checkOneTermReuse(a, "quilométricas", "quilométricas");
    checkOneTermReuse(a, "quilométricos", "quilométr");
  }
}

@@ -0,0 +1,54 @@
package org.apache.lucene.analysis.ro;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;

public class TestRomanianAnalyzer extends BaseTokenStreamTestCase {
  /** This test fails with NPE when the
   * stopwords file is missing in classpath */
  public void testResourcesAvailable() {
    new RomanianAnalyzer(Version.LUCENE_CURRENT);
  }

  /** test stopwords and stemming */
  public void testBasics() throws IOException {
    Analyzer a = new RomanianAnalyzer(Version.LUCENE_CURRENT);
    // stemming
    checkOneTermReuse(a, "absenţa", "absenţ");
    checkOneTermReuse(a, "absenţi", "absenţ");
    // stopword
    assertAnalyzesTo(a, "îl", new String[] {});
  }

  /** test use of exclusion set */
  public void testExclude() throws IOException {
    Set<String> exclusionSet = new HashSet<String>();
    exclusionSet.add("absenţa");
    Analyzer a = new RomanianAnalyzer(Version.LUCENE_CURRENT,
        RomanianAnalyzer.getDefaultStopSet(), exclusionSet);
    checkOneTermReuse(a, "absenţa", "absenţa");
    checkOneTermReuse(a, "absenţi", "absenţ");
  }
}
@@ -50,9 +50,14 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase
    dataDir = new File(System.getProperty("dataDir", "./bin"));
  }

  public void testUnicode() throws IOException
  /**
   * @deprecated remove this test and its datafiles in Lucene 4.0
   * the Snowball version has its own data tests.
   */
  @Deprecated
  public void testUnicode30() throws IOException
  {
    RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_CURRENT);
    RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_30);
    inWords =
      new InputStreamReader(
        new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUTF8.txt")),

@@ -110,12 +115,22 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase
    }
  }

  /** @deprecated remove this test in Lucene 4.0: stopwords changed */
  @Deprecated
  public void testReusableTokenStream30() throws Exception {
    Analyzer a = new RussianAnalyzer(Version.LUCENE_30);
    assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
        new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" });
    assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
        new String[] { "знан", "хран", "тайн" });
  }

  public void testReusableTokenStream() throws Exception {
    Analyzer a = new RussianAnalyzer(Version.LUCENE_CURRENT);
    assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
        new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" });
    assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
        new String[] { "знан", "хран", "тайн" });
        new String[] { "знан", "эт", "хран", "тайн" });
  }


@@ -25,7 +25,9 @@ import org.apache.lucene.util.Version;

/**
 * Testcase for {@link RussianLetterTokenizer}
 * @deprecated Remove this test class in Lucene 4.0
 */
@Deprecated
public class TestRussianLetterTokenizer extends BaseTokenStreamTestCase {

  public void testRussianLetterTokenizer() throws IOException {

@@ -24,6 +24,10 @@ import java.io.InputStreamReader;
import java.io.FileInputStream;
import java.util.ArrayList;

/**
 * @deprecated Remove this test class (and its datafiles!) in Lucene 4.0
 */
@Deprecated
public class TestRussianStem extends LuceneTestCase
{
  private ArrayList words = new ArrayList();

@@ -22,11 +22,8 @@ import java.io.StringReader;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.HashSet;
import java.util.Arrays;

import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter;
import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
@@ -0,0 +1,54 @@
package org.apache.lucene.analysis.sv;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;

public class TestSwedishAnalyzer extends BaseTokenStreamTestCase {
  /** This test fails with NPE when the
   * stopwords file is missing in classpath */
  public void testResourcesAvailable() {
    new SwedishAnalyzer(Version.LUCENE_CURRENT);
  }

  /** test stopwords and stemming */
  public void testBasics() throws IOException {
    Analyzer a = new SwedishAnalyzer(Version.LUCENE_CURRENT);
    // stemming
    checkOneTermReuse(a, "jaktkarlarne", "jaktkarl");
    checkOneTermReuse(a, "jaktkarlens", "jaktkarl");
    // stopword
    assertAnalyzesTo(a, "och", new String[] {});
  }

  /** test use of exclusion set */
  public void testExclude() throws IOException {
    Set<String> exclusionSet = new HashSet<String>();
    exclusionSet.add("jaktkarlarne");
    Analyzer a = new SwedishAnalyzer(Version.LUCENE_CURRENT,
        SwedishAnalyzer.getDefaultStopSet(), exclusionSet);
    checkOneTermReuse(a, "jaktkarlarne", "jaktkarlarne");
    checkOneTermReuse(a, "jaktkarlens", "jaktkarl");
  }
}

@@ -0,0 +1,54 @@
package org.apache.lucene.analysis.tr;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;

public class TestTurkishAnalyzer extends BaseTokenStreamTestCase {
  /** This test fails with NPE when the
   * stopwords file is missing in classpath */
  public void testResourcesAvailable() {
    new TurkishAnalyzer(Version.LUCENE_CURRENT);
  }

  /** test stopwords and stemming */
  public void testBasics() throws IOException {
    Analyzer a = new TurkishAnalyzer(Version.LUCENE_CURRENT);
    // stemming
    checkOneTermReuse(a, "ağacı", "ağaç");
    checkOneTermReuse(a, "ağaç", "ağaç");
    // stopword
    assertAnalyzesTo(a, "dolayı", new String[] {});
  }

  /** test use of exclusion set */
  public void testExclude() throws IOException {
    Set<String> exclusionSet = new HashSet<String>();
    exclusionSet.add("ağacı");
    Analyzer a = new TurkishAnalyzer(Version.LUCENE_CURRENT,
        TurkishAnalyzer.getDefaultStopSet(), exclusionSet);
    checkOneTermReuse(a, "ağacı", "ağacı");
    checkOneTermReuse(a, "ağaç", "ağaç");
  }
}