mirror of https://github.com/apache/lucene.git
LUCENE-2055: better snowball integration, deprecate buggy handcoded snowball impls, restructure lang support
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@907125 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
57d1387492
commit
a6b7c5552b
|
@ -23,6 +23,11 @@ stopword list that is BSD-licensed created by Jacques Savoy. The file resides i
|
||||||
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt.
|
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt.
|
||||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
|
||||||
|
The Romanian analyzer (contrib/analyzers) comes with a default
|
||||||
|
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in
|
||||||
|
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt.
|
||||||
|
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
|
||||||
The Bulgarian analyzer (contrib/analyzers) comes with a default
|
The Bulgarian analyzer (contrib/analyzers) comes with a default
|
||||||
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in
|
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in
|
||||||
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt.
|
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt.
|
||||||
|
|
|
@ -27,6 +27,10 @@ Changes in runtime behavior
|
||||||
used with Version > 3.0 and the TurkishStemmer.
|
used with Version > 3.0 and the TurkishStemmer.
|
||||||
(Robert Muir via Simon Willnauer)
|
(Robert Muir via Simon Willnauer)
|
||||||
|
|
||||||
|
* LUCENE-2055: GermanAnalyzer now uses the Snowball German2 algorithm and
|
||||||
|
stopwords list by default for Version > 3.0.
|
||||||
|
(Robert Muir, Uwe Schindler, Simon Willnauer)
|
||||||
|
|
||||||
Bug fixes
|
Bug fixes
|
||||||
|
|
||||||
* LUCENE-2199: ShingleFilter skipped over tri-gram shingles if outputUnigram
|
* LUCENE-2199: ShingleFilter skipped over tri-gram shingles if outputUnigram
|
||||||
|
@ -54,6 +58,13 @@ Bug fixes
|
||||||
CJKTokenizer, ChineseTokenizer, SmartChinese SentenceTokenizer,
|
CJKTokenizer, ChineseTokenizer, SmartChinese SentenceTokenizer,
|
||||||
and WikipediaTokenizer. (Koji Sekiguchi, Robert Muir)
|
and WikipediaTokenizer. (Koji Sekiguchi, Robert Muir)
|
||||||
|
|
||||||
|
* LUCENE-2055: Deprecated RussianTokenizer, RussianStemmer, RussianStemFilter,
|
||||||
|
FrenchStemmer, FrenchStemFilter, DutchStemmer, and DutchStemFilter. For
|
||||||
|
these Analyzers, SnowballFilter is used instead (for Version > 3.0), as
|
||||||
|
the previous code did not always implement the Snowball algorithm correctly.
|
||||||
|
Additionally, for Version > 3.0, the Snowball stopword lists are used by
|
||||||
|
default. (Robert Muir, Uwe Schindler, Simon Willnauer)
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
* LUCENE-2108: Add SpellChecker.close, to close the underlying
|
* LUCENE-2108: Add SpellChecker.close, to close the underlying
|
||||||
|
@ -69,6 +80,12 @@ API Changes
|
||||||
* LUCENE-2204: Change some package private classes/members to publicly accessible to implement
|
* LUCENE-2204: Change some package private classes/members to publicly accessible to implement
|
||||||
custom FragmentsBuilders. (Koji Sekiguchi)
|
custom FragmentsBuilders. (Koji Sekiguchi)
|
||||||
|
|
||||||
|
* LUCENE-2055: Integrate snowball into contrib/analyzers. SnowballAnalyzer is
|
||||||
|
now deprecated in favor of language-specific analyzers which contain things
|
||||||
|
such as stopword lists and any language-specific processing in addition to
|
||||||
|
stemming. Add Turkish and Romanian stopwords lists to support this.
|
||||||
|
(Robert Muir, Uwe Schindler, Simon Willnauer)
|
||||||
|
|
||||||
New features
|
New features
|
||||||
|
|
||||||
* LUCENE-2102: Add a Turkish LowerCase Filter. TurkishLowerCaseFilter handles
|
* LUCENE-2102: Add a Turkish LowerCase Filter. TurkishLowerCaseFilter handles
|
||||||
|
@ -105,6 +122,10 @@ New features
|
||||||
|
|
||||||
* LUCENE-2234: Add a Hindi analyzer. (Robert Muir)
|
* LUCENE-2234: Add a Hindi analyzer. (Robert Muir)
|
||||||
|
|
||||||
|
* LUCENE-2055: Add analyzers/misc/StemmerOverrideFilter. This filter provides
|
||||||
|
the ability to override any stemmer with a custom dictionary map.
|
||||||
|
(Robert Muir, Uwe Schindler, Simon Willnauer)
|
||||||
|
|
||||||
Build
|
Build
|
||||||
|
|
||||||
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
||||||
|
|
|
@ -0,0 +1,129 @@
|
||||||
|
package org.apache.lucene.analysis.da;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
||||||
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.WordlistLoader;
|
||||||
|
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||||
|
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
import org.tartarus.snowball.ext.DanishStemmer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@link Analyzer} for Danish.
|
||||||
|
*/
|
||||||
|
public final class DanishAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
private final Set<?> stemExclusionSet;
|
||||||
|
|
||||||
|
/** File containing default Danish stopwords. */
|
||||||
|
public final static String DEFAULT_STOPWORD_FILE = "danish_stop.txt";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
|
* @return default stop words set.
|
||||||
|
*/
|
||||||
|
public static Set<?> getDefaultStopSet(){
|
||||||
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||||
|
* accesses the static final set the first time.;
|
||||||
|
*/
|
||||||
|
private static class DefaultSetHolder {
|
||||||
|
static final Set<?> DEFAULT_STOP_SET;
|
||||||
|
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
|
||||||
|
DEFAULT_STOPWORD_FILE);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
// default set should always be present as it is part of the
|
||||||
|
// distribution (JAR)
|
||||||
|
throw new RuntimeException("Unable to load default stopword set");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||||
|
*/
|
||||||
|
public DanishAnalyzer(Version matchVersion) {
|
||||||
|
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
*/
|
||||||
|
public DanishAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||||
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
|
||||||
|
* provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
|
||||||
|
* stemming.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
|
*/
|
||||||
|
public DanishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||||
|
super(matchVersion, stopwords);
|
||||||
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
|
matchVersion, stemExclusionSet));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
|
||||||
|
* {@link Reader}.
|
||||||
|
*
|
||||||
|
* @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
|
||||||
|
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
|
||||||
|
* {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
|
||||||
|
* exclusion set is provided and {@link SnowballFilter}.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||||
|
TokenStream result = new StandardFilter(source);
|
||||||
|
result = new LowerCaseFilter(matchVersion, result);
|
||||||
|
result = new StopFilter(matchVersion, result, stopwords);
|
||||||
|
if(!stemExclusionSet.isEmpty())
|
||||||
|
result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
|
||||||
|
result = new SnowballFilter(result, new DanishStemmer());
|
||||||
|
return new TokenStreamComponents(source, result);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Analyzer for Danish.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -36,10 +36,12 @@ import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.WordlistLoader;
|
import org.apache.lucene.analysis.WordlistLoader;
|
||||||
|
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
import org.tartarus.snowball.ext.German2Stemmer;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* {@link Analyzer} for German language.
|
* {@link Analyzer} for German language.
|
||||||
|
@ -51,6 +53,16 @@ import org.apache.lucene.util.Version;
|
||||||
* exclusion list is empty by default.
|
* exclusion list is empty by default.
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
|
* <a name="version"/>
|
||||||
|
* <p>You must specify the required {@link Version}
|
||||||
|
* compatibility when creating GermanAnalyzer:
|
||||||
|
* <ul>
|
||||||
|
* <li> As of 3.1, Snowball stemming is done with SnowballFilter, and
|
||||||
|
* Snowball stopwords are used by default.
|
||||||
|
* <li> As of 2.9, StopFilter preserves position
|
||||||
|
* increments
|
||||||
|
* </ul>
|
||||||
|
*
|
||||||
* <p><b>NOTE</b>: This class uses the same {@link Version}
|
* <p><b>NOTE</b>: This class uses the same {@link Version}
|
||||||
* dependent settings as {@link StandardAnalyzer}.</p>
|
* dependent settings as {@link StandardAnalyzer}.</p>
|
||||||
*/
|
*/
|
||||||
|
@ -60,7 +72,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
|
||||||
* List of typical german stopwords.
|
* List of typical german stopwords.
|
||||||
* @deprecated use {@link #getDefaultStopSet()} instead
|
* @deprecated use {@link #getDefaultStopSet()} instead
|
||||||
*/
|
*/
|
||||||
//TODO make this private in 3.1
|
//TODO make this private in 3.1, remove in 4.0
|
||||||
@Deprecated
|
@Deprecated
|
||||||
public final static String[] GERMAN_STOP_WORDS = {
|
public final static String[] GERMAN_STOP_WORDS = {
|
||||||
"einer", "eine", "eines", "einem", "einen",
|
"einer", "eine", "eines", "einem", "einen",
|
||||||
|
@ -77,6 +89,9 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
|
||||||
"durch", "wegen", "wird"
|
"durch", "wegen", "wird"
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/** File containing default German stopwords. */
|
||||||
|
public final static String DEFAULT_STOPWORD_FILE = "german_stop.txt";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a set of default German-stopwords
|
* Returns a set of default German-stopwords
|
||||||
* @return a set of default German-stopwords
|
* @return a set of default German-stopwords
|
||||||
|
@ -86,8 +101,21 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
private static final Set<?> DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
|
/** @deprecated remove in Lucene 4.0 */
|
||||||
|
@Deprecated
|
||||||
|
private static final Set<?> DEFAULT_SET_30 = CharArraySet.unmodifiableSet(new CharArraySet(
|
||||||
Version.LUCENE_CURRENT, Arrays.asList(GERMAN_STOP_WORDS), false));
|
Version.LUCENE_CURRENT, Arrays.asList(GERMAN_STOP_WORDS), false));
|
||||||
|
private static final Set<?> DEFAULT_SET;
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
DEFAULT_SET =
|
||||||
|
WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
// default set should always be present as it is part of the
|
||||||
|
// distribution (JAR)
|
||||||
|
throw new RuntimeException("Unable to load default stopword set");
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -105,7 +133,9 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
|
||||||
* {@link #getDefaultStopSet()}.
|
* {@link #getDefaultStopSet()}.
|
||||||
*/
|
*/
|
||||||
public GermanAnalyzer(Version matchVersion) {
|
public GermanAnalyzer(Version matchVersion) {
|
||||||
this(matchVersion, DefaultSetHolder.DEFAULT_SET);
|
this(matchVersion,
|
||||||
|
matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_SET
|
||||||
|
: DefaultSetHolder.DEFAULT_SET_30);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -199,8 +229,9 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
|
||||||
*
|
*
|
||||||
* @return {@link TokenStreamComponents} built from a
|
* @return {@link TokenStreamComponents} built from a
|
||||||
* {@link StandardTokenizer} filtered with {@link StandardFilter},
|
* {@link StandardTokenizer} filtered with {@link StandardFilter},
|
||||||
* {@link LowerCaseFilter}, {@link StopFilter}, and
|
* {@link LowerCaseFilter}, {@link StopFilter},
|
||||||
* {@link GermanStemFilter}
|
* {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided, and
|
||||||
|
* {@link SnowballFilter}
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(String fieldName,
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
@ -210,6 +241,10 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
|
||||||
result = new LowerCaseFilter(matchVersion, result);
|
result = new LowerCaseFilter(matchVersion, result);
|
||||||
result = new StopFilter( matchVersion, result, stopwords);
|
result = new StopFilter( matchVersion, result, stopwords);
|
||||||
result = new KeywordMarkerTokenFilter(result, exclusionSet);
|
result = new KeywordMarkerTokenFilter(result, exclusionSet);
|
||||||
return new TokenStreamComponents(source, new GermanStemFilter(result));
|
if (matchVersion.onOrAfter(Version.LUCENE_31))
|
||||||
|
result = new SnowballFilter(result, new German2Stemmer());
|
||||||
|
else
|
||||||
|
result = new GermanStemFilter(result);
|
||||||
|
return new TokenStreamComponents(source, result);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -24,6 +24,7 @@ import org.apache.lucene.analysis.StopFilter;
|
||||||
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
|
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
@ -41,6 +42,15 @@ import java.util.Set;
|
||||||
* A default set of stopwords is used unless an alternative list is specified.
|
* A default set of stopwords is used unless an alternative list is specified.
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
|
* <a name="version"/>
|
||||||
|
* <p>You must specify the required {@link Version}
|
||||||
|
* compatibility when creating GreekAnalyzer:
|
||||||
|
* <ul>
|
||||||
|
* <li> As of 3.1, StandardFilter is used by default.
|
||||||
|
* <li> As of 2.9, StopFilter preserves position
|
||||||
|
* increments
|
||||||
|
* </ul>
|
||||||
|
*
|
||||||
* <p><b>NOTE</b>: This class uses the same {@link Version}
|
* <p><b>NOTE</b>: This class uses the same {@link Version}
|
||||||
* dependent settings as {@link StandardAnalyzer}.</p>
|
* dependent settings as {@link StandardAnalyzer}.</p>
|
||||||
*/
|
*/
|
||||||
|
@ -117,13 +127,15 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase
|
||||||
*
|
*
|
||||||
* @return {@link TokenStreamComponents} built from a
|
* @return {@link TokenStreamComponents} built from a
|
||||||
* {@link StandardTokenizer} filtered with
|
* {@link StandardTokenizer} filtered with
|
||||||
* {@link GreekLowerCaseFilter} and {@link StopFilter}
|
* {@link GreekLowerCaseFilter}, {@link StandardFilter} and {@link StopFilter}
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(String fieldName,
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
Reader reader) {
|
Reader reader) {
|
||||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||||
final TokenStream result = new GreekLowerCaseFilter(source);
|
TokenStream result = new GreekLowerCaseFilter(source);
|
||||||
|
if (matchVersion.onOrAfter(Version.LUCENE_31))
|
||||||
|
result = new StandardFilter(result);
|
||||||
return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
|
return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,113 @@
|
||||||
|
package org.apache.lucene.analysis.en;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
||||||
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.PorterStemFilter;
|
||||||
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||||
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@link Analyzer} for English.
|
||||||
|
*/
|
||||||
|
public final class EnglishAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
private final Set<?> stemExclusionSet;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
|
* @return default stop words set.
|
||||||
|
*/
|
||||||
|
public static Set<?> getDefaultStopSet(){
|
||||||
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||||
|
* accesses the static final set the first time.;
|
||||||
|
*/
|
||||||
|
private static class DefaultSetHolder {
|
||||||
|
static final Set<?> DEFAULT_STOP_SET = StandardAnalyzer.STOP_WORDS_SET;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the default stop words: {@link #getDefaultStopSet}.
|
||||||
|
*/
|
||||||
|
public EnglishAnalyzer(Version matchVersion) {
|
||||||
|
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
*/
|
||||||
|
public EnglishAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||||
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
|
||||||
|
* provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
|
||||||
|
* stemming.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
|
*/
|
||||||
|
public EnglishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||||
|
super(matchVersion, stopwords);
|
||||||
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
|
matchVersion, stemExclusionSet));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
|
||||||
|
* {@link Reader}.
|
||||||
|
*
|
||||||
|
* @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
|
||||||
|
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
|
||||||
|
* {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
|
||||||
|
* exclusion set is provided and {@link PorterStemFilter}.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||||
|
TokenStream result = new StandardFilter(source);
|
||||||
|
result = new LowerCaseFilter(matchVersion, result);
|
||||||
|
result = new StopFilter(matchVersion, result, stopwords);
|
||||||
|
if(!stemExclusionSet.isEmpty())
|
||||||
|
result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
|
||||||
|
result = new PorterStemFilter(result);
|
||||||
|
return new TokenStreamComponents(source, result);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Analyzer for English.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -0,0 +1,129 @@
|
||||||
|
package org.apache.lucene.analysis.es;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
||||||
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.WordlistLoader;
|
||||||
|
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||||
|
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
import org.tartarus.snowball.ext.SpanishStemmer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@link Analyzer} for Spanish.
|
||||||
|
*/
|
||||||
|
public final class SpanishAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
private final Set<?> stemExclusionSet;
|
||||||
|
|
||||||
|
/** File containing default Spanish stopwords. */
|
||||||
|
public final static String DEFAULT_STOPWORD_FILE = "spanish_stop.txt";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
|
* @return default stop words set.
|
||||||
|
*/
|
||||||
|
public static Set<?> getDefaultStopSet(){
|
||||||
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||||
|
* accesses the static final set the first time.
|
||||||
|
*/
|
||||||
|
private static class DefaultSetHolder {
|
||||||
|
static final Set<?> DEFAULT_STOP_SET;
|
||||||
|
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
|
||||||
|
DEFAULT_STOPWORD_FILE);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
// default set should always be present as it is part of the
|
||||||
|
// distribution (JAR)
|
||||||
|
throw new RuntimeException("Unable to load default stopword set");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||||
|
*/
|
||||||
|
public SpanishAnalyzer(Version matchVersion) {
|
||||||
|
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
*/
|
||||||
|
public SpanishAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||||
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
|
||||||
|
* provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
|
||||||
|
* stemming.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
|
*/
|
||||||
|
public SpanishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||||
|
super(matchVersion, stopwords);
|
||||||
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
|
matchVersion, stemExclusionSet));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
|
||||||
|
* {@link Reader}.
|
||||||
|
*
|
||||||
|
* @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
|
||||||
|
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
|
||||||
|
* {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
|
||||||
|
* exclusion set is provided and {@link SnowballFilter}.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||||
|
TokenStream result = new StandardFilter(source);
|
||||||
|
result = new LowerCaseFilter(matchVersion, result);
|
||||||
|
result = new StopFilter(matchVersion, result, stopwords);
|
||||||
|
if(!stemExclusionSet.isEmpty())
|
||||||
|
result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
|
||||||
|
result = new SnowballFilter(result, new SpanishStemmer());
|
||||||
|
return new TokenStreamComponents(source, result);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Analyzer for Spanish.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -0,0 +1,129 @@
|
||||||
|
package org.apache.lucene.analysis.fi;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
||||||
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.WordlistLoader;
|
||||||
|
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||||
|
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
import org.tartarus.snowball.ext.FinnishStemmer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@link Analyzer} for Finnish.
|
||||||
|
*/
|
||||||
|
public final class FinnishAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
private final Set<?> stemExclusionSet;
|
||||||
|
|
||||||
|
/** File containing default Italian stopwords. */
|
||||||
|
public final static String DEFAULT_STOPWORD_FILE = "finnish_stop.txt";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
|
* @return default stop words set.
|
||||||
|
*/
|
||||||
|
public static Set<?> getDefaultStopSet(){
|
||||||
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||||
|
* accesses the static final set the first time.;
|
||||||
|
*/
|
||||||
|
private static class DefaultSetHolder {
|
||||||
|
static final Set<?> DEFAULT_STOP_SET;
|
||||||
|
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
|
||||||
|
DEFAULT_STOPWORD_FILE);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
// default set should always be present as it is part of the
|
||||||
|
// distribution (JAR)
|
||||||
|
throw new RuntimeException("Unable to load default stopword set");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||||
|
*/
|
||||||
|
public FinnishAnalyzer(Version matchVersion) {
|
||||||
|
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
*/
|
||||||
|
public FinnishAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||||
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
|
||||||
|
* provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
|
||||||
|
* stemming.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
|
*/
|
||||||
|
public FinnishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||||
|
super(matchVersion, stopwords);
|
||||||
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
|
matchVersion, stemExclusionSet));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
|
||||||
|
* {@link Reader}.
|
||||||
|
*
|
||||||
|
* @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
|
||||||
|
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
|
||||||
|
* {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
|
||||||
|
* exclusion set is provided and {@link SnowballFilter}.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||||
|
TokenStream result = new StandardFilter(source);
|
||||||
|
result = new LowerCaseFilter(matchVersion, result);
|
||||||
|
result = new StopFilter(matchVersion, result, stopwords);
|
||||||
|
if(!stemExclusionSet.isEmpty())
|
||||||
|
result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
|
||||||
|
result = new SnowballFilter(result, new FinnishStemmer());
|
||||||
|
return new TokenStreamComponents(source, result);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Analyzer for Finnish.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -68,7 +68,7 @@ public final class ElisionFilter extends TokenFilter {
|
||||||
/**
|
/**
|
||||||
* Constructs an elision filter with standard stop words
|
* Constructs an elision filter with standard stop words
|
||||||
*/
|
*/
|
||||||
protected ElisionFilter(Version matchVersion, TokenStream input) {
|
public ElisionFilter(Version matchVersion, TokenStream input) {
|
||||||
this(matchVersion, input, DEFAULT_ARTICLES);
|
this(matchVersion, input, DEFAULT_ARTICLES);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -77,7 +77,7 @@ public final class ElisionFilter extends TokenFilter {
|
||||||
* @deprecated use {@link #ElisionFilter(Version, TokenStream)} instead
|
* @deprecated use {@link #ElisionFilter(Version, TokenStream)} instead
|
||||||
*/
|
*/
|
||||||
@Deprecated
|
@Deprecated
|
||||||
protected ElisionFilter(TokenStream input) {
|
public ElisionFilter(TokenStream input) {
|
||||||
this(Version.LUCENE_30, input);
|
this(Version.LUCENE_30, input);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -27,6 +27,7 @@ import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.WordlistLoader;
|
import org.apache.lucene.analysis.WordlistLoader;
|
||||||
|
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
|
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
|
||||||
|
@ -55,6 +56,9 @@ import java.util.Set;
|
||||||
* <p>You must specify the required {@link Version}
|
* <p>You must specify the required {@link Version}
|
||||||
* compatibility when creating FrenchAnalyzer:
|
* compatibility when creating FrenchAnalyzer:
|
||||||
* <ul>
|
* <ul>
|
||||||
|
* <li> As of 3.1, Snowball stemming is done with SnowballFilter,
|
||||||
|
* LowerCaseFilter is used prior to StopFilter, and ElisionFilter and
|
||||||
|
* Snowball stopwords are used by default.
|
||||||
* <li> As of 2.9, StopFilter preserves position
|
* <li> As of 2.9, StopFilter preserves position
|
||||||
* increments
|
* increments
|
||||||
* </ul>
|
* </ul>
|
||||||
|
@ -68,7 +72,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Extended list of typical French stopwords.
|
* Extended list of typical French stopwords.
|
||||||
* @deprecated use {@link #getDefaultStopSet()} instead
|
* @deprecated use {@link #getDefaultStopSet()} instead
|
||||||
*/
|
*/
|
||||||
// TODO make this private in 3.1
|
// TODO make this private in 3.1, remove in 4.0
|
||||||
@Deprecated
|
@Deprecated
|
||||||
public final static String[] FRENCH_STOP_WORDS = {
|
public final static String[] FRENCH_STOP_WORDS = {
|
||||||
"a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi",
|
"a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi",
|
||||||
|
@ -95,6 +99,9 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
|
||||||
"été", "être", "ô"
|
"été", "être", "ô"
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/** File containing default French stopwords. */
|
||||||
|
public final static String DEFAULT_STOPWORD_FILE = "french_stop.txt";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Contains words that should be indexed but not stemmed.
|
* Contains words that should be indexed but not stemmed.
|
||||||
*/
|
*/
|
||||||
|
@ -110,16 +117,31 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET = CharArraySet
|
/** @deprecated remove this in Lucene 4.0 */
|
||||||
|
@Deprecated
|
||||||
|
static final Set<?> DEFAULT_STOP_SET_30 = CharArraySet
|
||||||
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(FRENCH_STOP_WORDS),
|
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(FRENCH_STOP_WORDS),
|
||||||
false));
|
false));
|
||||||
|
static final Set<?> DEFAULT_STOP_SET;
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
DEFAULT_STOP_SET =
|
||||||
|
WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
// default set should always be present as it is part of the
|
||||||
|
// distribution (JAR)
|
||||||
|
throw new RuntimeException("Unable to load default stopword set");
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Builds an analyzer with the default stop words ({@link #FRENCH_STOP_WORDS}).
|
* Builds an analyzer with the default stop words ({@link #getDefaultStopSet}).
|
||||||
*/
|
*/
|
||||||
public FrenchAnalyzer(Version matchVersion) {
|
public FrenchAnalyzer(Version matchVersion) {
|
||||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
this(matchVersion,
|
||||||
|
matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_STOP_SET
|
||||||
|
: DefaultSetHolder.DEFAULT_STOP_SET_30);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -207,12 +229,25 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
|
||||||
* {@link Reader}.
|
* {@link Reader}.
|
||||||
*
|
*
|
||||||
* @return {@link TokenStreamComponents} built from a {@link StandardTokenizer}
|
* @return {@link TokenStreamComponents} built from a {@link StandardTokenizer}
|
||||||
* filtered with {@link StandardFilter}, {@link StopFilter},
|
* filtered with {@link StandardFilter}, {@link ElisionFilter},
|
||||||
* {@link FrenchStemFilter} and {@link LowerCaseFilter}
|
* {@link LowerCaseFilter}, {@link StopFilter},
|
||||||
|
* {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided,
|
||||||
|
* and {@link SnowballFilter}
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(String fieldName,
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
Reader reader) {
|
Reader reader) {
|
||||||
|
if (matchVersion.onOrAfter(Version.LUCENE_31)) {
|
||||||
|
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||||
|
TokenStream result = new StandardFilter(source);
|
||||||
|
result = new ElisionFilter(matchVersion, result);
|
||||||
|
result = new LowerCaseFilter(matchVersion, result);
|
||||||
|
result = new StopFilter(matchVersion, result, stopwords);
|
||||||
|
if(!excltable.isEmpty())
|
||||||
|
result = new KeywordMarkerTokenFilter(result, excltable);
|
||||||
|
result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer());
|
||||||
|
return new TokenStreamComponents(source, result);
|
||||||
|
} else {
|
||||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||||
TokenStream result = new StandardFilter(source);
|
TokenStream result = new StandardFilter(source);
|
||||||
result = new StopFilter(matchVersion, result, stopwords);
|
result = new StopFilter(matchVersion, result, stopwords);
|
||||||
|
@ -222,5 +257,6 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
|
||||||
// Convert to lowercase after stemming!
|
// Convert to lowercase after stemming!
|
||||||
return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
|
return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.analysis.fr;
|
||||||
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;// for javadoc
|
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;// for javadoc
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
|
||||||
|
@ -40,7 +41,11 @@ import java.util.Set;
|
||||||
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||||
* </p>
|
* </p>
|
||||||
* @see KeywordMarkerTokenFilter
|
* @see KeywordMarkerTokenFilter
|
||||||
|
* @deprecated Use {@link SnowballFilter} with
|
||||||
|
* {@link org.tartarus.snowball.ext.FrenchStemmer} instead, which has the
|
||||||
|
* same functionality. This filter will be removed in Lucene 4.0
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public final class FrenchStemFilter extends TokenFilter {
|
public final class FrenchStemFilter extends TokenFilter {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -25,8 +25,10 @@ package org.apache.lucene.analysis.fr;
|
||||||
* refer to http://snowball.sourceforge.net/french/stemmer.html<br>
|
* refer to http://snowball.sourceforge.net/french/stemmer.html<br>
|
||||||
* (French stemming algorithm) for details
|
* (French stemming algorithm) for details
|
||||||
* </p>
|
* </p>
|
||||||
|
* @deprecated Use {@link org.tartarus.snowball.ext.FrenchStemmer} instead,
|
||||||
|
* which has the same functionality. This stemmer will be removed in Lucene 4.0
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public class FrenchStemmer {
|
public class FrenchStemmer {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -0,0 +1,129 @@
|
||||||
|
package org.apache.lucene.analysis.hu;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
||||||
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.WordlistLoader;
|
||||||
|
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||||
|
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
import org.tartarus.snowball.ext.HungarianStemmer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@link Analyzer} for Hungarian.
|
||||||
|
*/
|
||||||
|
public final class HungarianAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
private final Set<?> stemExclusionSet;
|
||||||
|
|
||||||
|
/** File containing default Hungarian stopwords. */
|
||||||
|
public final static String DEFAULT_STOPWORD_FILE = "hungarian_stop.txt";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
|
* @return default stop words set.
|
||||||
|
*/
|
||||||
|
public static Set<?> getDefaultStopSet(){
|
||||||
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||||
|
* accesses the static final set the first time.;
|
||||||
|
*/
|
||||||
|
private static class DefaultSetHolder {
|
||||||
|
static final Set<?> DEFAULT_STOP_SET;
|
||||||
|
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
|
||||||
|
DEFAULT_STOPWORD_FILE);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
// default set should always be present as it is part of the
|
||||||
|
// distribution (JAR)
|
||||||
|
throw new RuntimeException("Unable to load default stopword set");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||||
|
*/
|
||||||
|
public HungarianAnalyzer(Version matchVersion) {
|
||||||
|
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
*/
|
||||||
|
public HungarianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||||
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
|
||||||
|
* provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
|
||||||
|
* stemming.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
|
*/
|
||||||
|
public HungarianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||||
|
super(matchVersion, stopwords);
|
||||||
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
|
matchVersion, stemExclusionSet));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
|
||||||
|
* {@link Reader}.
|
||||||
|
*
|
||||||
|
* @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
|
||||||
|
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
|
||||||
|
* {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
|
||||||
|
* exclusion set is provided and {@link SnowballFilter}.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||||
|
TokenStream result = new StandardFilter(source);
|
||||||
|
result = new LowerCaseFilter(matchVersion, result);
|
||||||
|
result = new StopFilter(matchVersion, result, stopwords);
|
||||||
|
if(!stemExclusionSet.isEmpty())
|
||||||
|
result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
|
||||||
|
result = new SnowballFilter(result, new HungarianStemmer());
|
||||||
|
return new TokenStreamComponents(source, result);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Analyzer for Hungarian.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -0,0 +1,129 @@
|
||||||
|
package org.apache.lucene.analysis.it;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
||||||
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.WordlistLoader;
|
||||||
|
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||||
|
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
import org.tartarus.snowball.ext.ItalianStemmer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@link Analyzer} for Italian.
|
||||||
|
*/
|
||||||
|
public final class ItalianAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
private final Set<?> stemExclusionSet;
|
||||||
|
|
||||||
|
/** File containing default Italian stopwords. */
|
||||||
|
public final static String DEFAULT_STOPWORD_FILE = "italian_stop.txt";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
|
* @return default stop words set.
|
||||||
|
*/
|
||||||
|
public static Set<?> getDefaultStopSet(){
|
||||||
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||||
|
* accesses the static final set the first time.;
|
||||||
|
*/
|
||||||
|
private static class DefaultSetHolder {
|
||||||
|
static final Set<?> DEFAULT_STOP_SET;
|
||||||
|
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
|
||||||
|
DEFAULT_STOPWORD_FILE);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
// default set should always be present as it is part of the
|
||||||
|
// distribution (JAR)
|
||||||
|
throw new RuntimeException("Unable to load default stopword set");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Creates an analyzer that uses the default stop words, which are read from
 * {@link #DEFAULT_STOPWORD_FILE}.
 *
 * @param matchVersion lucene compatibility version
 */
public ItalianAnalyzer(Version matchVersion) {
  this(
      matchVersion,
      DefaultSetHolder.DEFAULT_STOP_SET);
}
|
||||||
|
|
||||||
|
/**
 * Creates an analyzer that uses the supplied stop words and an empty stem
 * exclusion set.
 *
 * @param matchVersion lucene compatibility version
 * @param stopwords a stopword set
 */
public ItalianAnalyzer(Version matchVersion, Set<?> stopwords) {
  this(
      matchVersion,
      stopwords,
      CharArraySet.EMPTY_SET);
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
|
||||||
|
* provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
|
||||||
|
* stemming.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
|
*/
|
||||||
|
public ItalianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||||
|
super(matchVersion, stopwords);
|
||||||
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
|
matchVersion, stemExclusionSet));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
|
||||||
|
* {@link Reader}.
|
||||||
|
*
|
||||||
|
* @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
|
||||||
|
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
|
||||||
|
* {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
|
||||||
|
* exclusion set is provided and {@link SnowballFilter}.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||||
|
TokenStream result = new StandardFilter(source);
|
||||||
|
result = new LowerCaseFilter(matchVersion, result);
|
||||||
|
result = new StopFilter(matchVersion, result, stopwords);
|
||||||
|
if(!stemExclusionSet.isEmpty())
|
||||||
|
result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
|
||||||
|
result = new SnowballFilter(result, new ItalianStemmer());
|
||||||
|
return new TokenStreamComponents(source, result);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Analyzer for Italian.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -18,7 +18,6 @@ package org.apache.lucene.analysis.miscellaneous;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Token;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,70 @@
|
||||||
|
package org.apache.lucene.analysis.miscellaneous;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.CharArrayMap;
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Provides the ability to override any {@link KeywordAttribute} aware stemmer
|
||||||
|
* with custom dictionary-based stemming.
|
||||||
|
*/
|
||||||
|
public final class StemmerOverrideFilter extends TokenFilter {
|
||||||
|
private final CharArrayMap<String> dictionary;
|
||||||
|
|
||||||
|
private final TermAttribute termAtt = addAttribute(TermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new StemmerOverrideFilter, performing dictionary-based stemming
|
||||||
|
* with the provided <code>dictionary</code>.
|
||||||
|
* <p>
|
||||||
|
* Any dictionary-stemmed terms will be marked with {@link KeywordAttribute}
|
||||||
|
* so that they will not be stemmed with stemmers down the chain.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public StemmerOverrideFilter(Version matchVersion, TokenStream input,
|
||||||
|
Map<?,String> dictionary) {
|
||||||
|
super(input);
|
||||||
|
this.dictionary = dictionary instanceof CharArrayMap ?
|
||||||
|
(CharArrayMap<String>) dictionary : CharArrayMap.copy(matchVersion, dictionary);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAtt.isKeyword()) { // don't muck with already-keyworded terms
|
||||||
|
String stem = dictionary.get(termAtt.termBuffer(), 0, termAtt.termLength());
|
||||||
|
if (stem != null) {
|
||||||
|
termAtt.setTermBuffer(stem);
|
||||||
|
keywordAtt.setKeyword(true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.ngram;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Token;
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
|
|
|
@ -17,7 +17,6 @@ package org.apache.lucene.analysis.ngram;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Token;
|
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
|
|
@ -20,11 +20,14 @@ package org.apache.lucene.analysis.nl;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.CharArraySet;
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
||||||
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
import org.apache.lucene.analysis.ReusableAnalyzerBase;
|
import org.apache.lucene.analysis.ReusableAnalyzerBase;
|
||||||
import org.apache.lucene.analysis.StopFilter;
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.WordlistLoader;
|
import org.apache.lucene.analysis.WordlistLoader;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
|
||||||
|
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
|
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
|
||||||
|
@ -33,7 +36,6 @@ import org.apache.lucene.util.Version;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
@ -51,6 +53,17 @@ import java.util.Map;
|
||||||
* exclusion list is empty by default.
|
* exclusion list is empty by default.
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
|
* <a name="version"/>
|
||||||
|
* <p>You must specify the required {@link Version}
|
||||||
|
* compatibility when creating DutchAnalyzer:
|
||||||
|
* <ul>
|
||||||
|
* <li> As of 3.1, Snowball stemming is done with SnowballFilter,
|
||||||
|
* LowerCaseFilter is used prior to StopFilter, and Snowball
|
||||||
|
* stopwords are used by default.
|
||||||
|
* <li> As of 2.9, StopFilter preserves position
|
||||||
|
* increments
|
||||||
|
* </ul>
|
||||||
|
*
|
||||||
* <p><b>NOTE</b>: This class uses the same {@link Version}
|
* <p><b>NOTE</b>: This class uses the same {@link Version}
|
||||||
* dependent settings as {@link StandardAnalyzer}.</p>
|
* dependent settings as {@link StandardAnalyzer}.</p>
|
||||||
*/
|
*/
|
||||||
|
@ -60,19 +73,11 @@ public final class DutchAnalyzer extends ReusableAnalyzerBase {
|
||||||
* @deprecated use {@link #getDefaultStopSet()} instead
|
* @deprecated use {@link #getDefaultStopSet()} instead
|
||||||
*/
|
*/
|
||||||
@Deprecated
|
@Deprecated
|
||||||
public final static String[] DUTCH_STOP_WORDS =
|
public final static String[] DUTCH_STOP_WORDS = getDefaultStopSet().toArray(new String[0]);
|
||||||
{
|
|
||||||
"de", "en", "van", "ik", "te", "dat", "die", "in", "een",
|
/** File containing default Dutch stopwords. */
|
||||||
"hij", "het", "niet", "zijn", "is", "was", "op", "aan", "met", "als", "voor", "had",
|
public final static String DEFAULT_STOPWORD_FILE = "dutch_stop.txt";
|
||||||
"er", "maar", "om", "hem", "dan", "zou", "of", "wat", "mijn", "men", "dit", "zo",
|
|
||||||
"door", "over", "ze", "zich", "bij", "ook", "tot", "je", "mij", "uit", "der", "daar",
|
|
||||||
"haar", "naar", "heb", "hoe", "heeft", "hebben", "deze", "u", "want", "nog", "zal",
|
|
||||||
"me", "zij", "nu", "ge", "geen", "omdat", "iets", "worden", "toch", "al", "waren",
|
|
||||||
"veel", "meer", "doen", "toen", "moet", "ben", "zonder", "kan", "hun", "dus",
|
|
||||||
"alles", "onder", "ja", "eens", "hier", "wie", "werd", "altijd", "doch", "wordt",
|
|
||||||
"wezen", "kunnen", "ons", "zelf", "tegen", "na", "reeds", "wil", "kon", "niets",
|
|
||||||
"uw", "iemand", "geweest", "andere"
|
|
||||||
};
|
|
||||||
/**
|
/**
|
||||||
* Returns an unmodifiable instance of the default stop-words set.
|
* Returns an unmodifiable instance of the default stop-words set.
|
||||||
* @return an unmodifiable instance of the default stop-words set.
|
* @return an unmodifiable instance of the default stop-words set.
|
||||||
|
@ -82,9 +87,18 @@ public final class DutchAnalyzer extends ReusableAnalyzerBase {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET = CharArraySet
|
static final Set<?> DEFAULT_STOP_SET;
|
||||||
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
|
|
||||||
Arrays.asList(DUTCH_STOP_WORDS), false));
|
static {
|
||||||
|
try {
|
||||||
|
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
|
||||||
|
DEFAULT_STOPWORD_FILE);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
// default set should always be present as it is part of the
|
||||||
|
// distribution (JAR)
|
||||||
|
throw new RuntimeException("Unable to load default stopword set");
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -223,12 +237,25 @@ public final class DutchAnalyzer extends ReusableAnalyzerBase {
|
||||||
* text in the provided {@link Reader}.
|
* text in the provided {@link Reader}.
|
||||||
*
|
*
|
||||||
* @return A {@link TokenStream} built from a {@link StandardTokenizer}
|
* @return A {@link TokenStream} built from a {@link StandardTokenizer}
|
||||||
* filtered with {@link StandardFilter}, {@link StopFilter},
|
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
|
||||||
* and {@link DutchStemFilter}
|
* {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided,
|
||||||
|
* {@link StemmerOverrideFilter}, and {@link SnowballFilter}
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(String fieldName,
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
Reader aReader) {
|
Reader aReader) {
|
||||||
|
if (matchVersion.onOrAfter(Version.LUCENE_31)) {
|
||||||
|
final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
|
||||||
|
TokenStream result = new StandardFilter(source);
|
||||||
|
result = new LowerCaseFilter(matchVersion, result);
|
||||||
|
result = new StopFilter(matchVersion, result, stoptable);
|
||||||
|
if (!excltable.isEmpty())
|
||||||
|
result = new KeywordMarkerTokenFilter(result, excltable);
|
||||||
|
if (!stemdict.isEmpty())
|
||||||
|
result = new StemmerOverrideFilter(matchVersion, result, stemdict);
|
||||||
|
result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
|
||||||
|
return new TokenStreamComponents(source, result);
|
||||||
|
} else {
|
||||||
final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
|
final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
|
||||||
TokenStream result = new StandardFilter(source);
|
TokenStream result = new StandardFilter(source);
|
||||||
result = new StopFilter(matchVersion, result, stoptable);
|
result = new StopFilter(matchVersion, result, stoptable);
|
||||||
|
@ -237,4 +264,5 @@ public final class DutchAnalyzer extends ReusableAnalyzerBase {
|
||||||
result = new DutchStemFilter(result, stemdict);
|
result = new DutchStemFilter(result, stemdict);
|
||||||
return new TokenStreamComponents(source, result);
|
return new TokenStreamComponents(source, result);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,6 +26,7 @@ import java.util.Set;
|
||||||
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;// for javadoc
|
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;// for javadoc
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
|
||||||
|
@ -42,7 +43,11 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||||
* </p>
|
* </p>
|
||||||
* @see KeywordMarkerTokenFilter
|
* @see KeywordMarkerTokenFilter
|
||||||
|
* @deprecated Use {@link SnowballFilter} with
|
||||||
|
* {@link org.tartarus.snowball.ext.DutchStemmer} instead, which has the
|
||||||
|
* same functionality. This filter will be removed in Lucene 4.0
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public final class DutchStemFilter extends TokenFilter {
|
public final class DutchStemFilter extends TokenFilter {
|
||||||
/**
|
/**
|
||||||
* The actual token in the input stream.
|
* The actual token in the input stream.
|
||||||
|
|
|
@ -26,8 +26,10 @@ import java.util.Map;
|
||||||
* the <a href="http://snowball.tartarus.org/algorithms/dutch/stemmer.html">dutch stemming</a>
|
* the <a href="http://snowball.tartarus.org/algorithms/dutch/stemmer.html">dutch stemming</a>
|
||||||
* algorithm in Martin Porter's snowball project.
|
* algorithm in Martin Porter's snowball project.
|
||||||
* </p>
|
* </p>
|
||||||
|
* @deprecated Use {@link org.tartarus.snowball.ext.DutchStemmer} instead,
|
||||||
|
* which has the same functionality. This stemmer will be removed in Lucene 4.0
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public class DutchStemmer {
|
public class DutchStemmer {
|
||||||
/**
|
/**
|
||||||
* Buffer for the terms while stemming them.
|
* Buffer for the terms while stemming them.
|
||||||
|
|
|
@ -0,0 +1,130 @@
|
||||||
|
package org.apache.lucene.analysis.no;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
||||||
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.WordlistLoader;
|
||||||
|
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||||
|
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
import org.tartarus.snowball.ext.NorwegianStemmer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@link Analyzer} for Norwegian.
|
||||||
|
*/
|
||||||
|
public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
private final Set<?> stemExclusionSet;
|
||||||
|
|
||||||
|
/** File containing default Norwegian stopwords. */
|
||||||
|
public final static String DEFAULT_STOPWORD_FILE = "norwegian_stop.txt";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
|
* @return default stop words set.
|
||||||
|
*/
|
||||||
|
public static Set<?> getDefaultStopSet(){
|
||||||
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||||
|
* accesses the static final set the first time.;
|
||||||
|
*/
|
||||||
|
private static class DefaultSetHolder {
|
||||||
|
static final Set<?> DEFAULT_STOP_SET;
|
||||||
|
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
|
||||||
|
DEFAULT_STOPWORD_FILE);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
// default set should always be present as it is part of the
|
||||||
|
// distribution (JAR)
|
||||||
|
throw new RuntimeException("Unable to load default stopword set");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||||
|
*/
|
||||||
|
public NorwegianAnalyzer(Version matchVersion) {
|
||||||
|
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
*/
|
||||||
|
public NorwegianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||||
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
|
||||||
|
* provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
|
||||||
|
* stemming.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
|
*/
|
||||||
|
public NorwegianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||||
|
super(matchVersion, stopwords);
|
||||||
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
|
matchVersion, stemExclusionSet));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
|
||||||
|
* {@link Reader}.
|
||||||
|
*
|
||||||
|
* @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
|
||||||
|
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
|
||||||
|
* {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
|
||||||
|
* exclusion set is provided and {@link SnowballFilter}.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||||
|
TokenStream result = new StandardFilter(source);
|
||||||
|
result = new LowerCaseFilter(matchVersion, result);
|
||||||
|
result = new StopFilter(matchVersion, result, stopwords);
|
||||||
|
if(!stemExclusionSet.isEmpty())
|
||||||
|
result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
|
||||||
|
result = new SnowballFilter(result, new NorwegianStemmer());
|
||||||
|
return new TokenStreamComponents(source, result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,22 @@
|
||||||
|
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Analyzer for Norwegian.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -17,7 +17,6 @@ package org.apache.lucene.analysis.payloads;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Token;
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.payloads;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Token;
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.payloads;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Token;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||||
import org.apache.lucene.index.Payload;
|
import org.apache.lucene.index.Payload;
|
||||||
|
|
|
@ -21,7 +21,6 @@ import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Token;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
|
||||||
/** Set the positionIncrement of all tokens to the "positionIncrement",
|
/** Set the positionIncrement of all tokens to the "positionIncrement",
|
||||||
|
|
|
@ -0,0 +1,129 @@
|
||||||
|
package org.apache.lucene.analysis.pt;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
||||||
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.WordlistLoader;
|
||||||
|
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||||
|
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
import org.tartarus.snowball.ext.PortugueseStemmer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@link Analyzer} for Portuguese.
|
||||||
|
*/
|
||||||
|
public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
private final Set<?> stemExclusionSet;
|
||||||
|
|
||||||
|
/** File containing default Portuguese stopwords. */
|
||||||
|
public final static String DEFAULT_STOPWORD_FILE = "portuguese_stop.txt";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
|
* @return default stop words set.
|
||||||
|
*/
|
||||||
|
public static Set<?> getDefaultStopSet(){
|
||||||
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||||
|
* accesses the static final set the first time.
|
||||||
|
*/
|
||||||
|
private static class DefaultSetHolder {
|
||||||
|
static final Set<?> DEFAULT_STOP_SET;
|
||||||
|
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
|
||||||
|
DEFAULT_STOPWORD_FILE);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
// default set should always be present as it is part of the
|
||||||
|
// distribution (JAR)
|
||||||
|
throw new RuntimeException("Unable to load default stopword set");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||||
|
*/
|
||||||
|
public PortugueseAnalyzer(Version matchVersion) {
|
||||||
|
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
*/
|
||||||
|
public PortugueseAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||||
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
|
||||||
|
* provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
|
||||||
|
* stemming.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
|
*/
|
||||||
|
public PortugueseAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||||
|
super(matchVersion, stopwords);
|
||||||
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
|
matchVersion, stemExclusionSet));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
|
||||||
|
* {@link Reader}.
|
||||||
|
*
|
||||||
|
* @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
|
||||||
|
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
|
||||||
|
* {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
|
||||||
|
* exclusion set is provided and {@link SnowballFilter}.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||||
|
TokenStream result = new StandardFilter(source);
|
||||||
|
result = new LowerCaseFilter(matchVersion, result);
|
||||||
|
result = new StopFilter(matchVersion, result, stopwords);
|
||||||
|
if(!stemExclusionSet.isEmpty())
|
||||||
|
result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
|
||||||
|
result = new SnowballFilter(result, new PortugueseStemmer());
|
||||||
|
return new TokenStreamComponents(source, result);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Analyzer for Portuguese.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -0,0 +1,133 @@
|
||||||
|
package org.apache.lucene.analysis.ro;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
||||||
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||||
|
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
import org.tartarus.snowball.ext.RomanianStemmer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@link Analyzer} for Romanian.
|
||||||
|
*/
|
||||||
|
public final class RomanianAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
private final Set<?> stemExclusionSet;
|
||||||
|
|
||||||
|
/** File containing default Romanian stopwords. */
|
||||||
|
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||||
|
/**
|
||||||
|
* The comment character in the stopwords file.
|
||||||
|
* All lines prefixed with this will be ignored.
|
||||||
|
*/
|
||||||
|
private static final String STOPWORDS_COMMENT = "#";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
|
* @return default stop words set.
|
||||||
|
*/
|
||||||
|
public static Set<?> getDefaultStopSet(){
|
||||||
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||||
|
* accesses the static final set the first time.;
|
||||||
|
*/
|
||||||
|
private static class DefaultSetHolder {
|
||||||
|
static final Set<?> DEFAULT_STOP_SET;
|
||||||
|
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
DEFAULT_STOP_SET = loadStopwordSet(false, RomanianAnalyzer.class,
|
||||||
|
DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
// default set should always be present as it is part of the
|
||||||
|
// distribution (JAR)
|
||||||
|
throw new RuntimeException("Unable to load default stopword set");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||||
|
*/
|
||||||
|
public RomanianAnalyzer(Version matchVersion) {
|
||||||
|
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
*/
|
||||||
|
public RomanianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||||
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
|
||||||
|
* provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
|
||||||
|
* stemming.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
|
*/
|
||||||
|
public RomanianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||||
|
super(matchVersion, stopwords);
|
||||||
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
|
matchVersion, stemExclusionSet));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
|
||||||
|
* {@link Reader}.
|
||||||
|
*
|
||||||
|
* @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
|
||||||
|
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
|
||||||
|
* {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
|
||||||
|
* exclusion set is provided and {@link SnowballFilter}.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||||
|
TokenStream result = new StandardFilter(source);
|
||||||
|
result = new LowerCaseFilter(matchVersion, result);
|
||||||
|
result = new StopFilter(matchVersion, result, stopwords);
|
||||||
|
if(!stemExclusionSet.isEmpty())
|
||||||
|
result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
|
||||||
|
result = new SnowballFilter(result, new RomanianStemmer());
|
||||||
|
return new TokenStreamComponents(source, result);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Analyzer for Romanian.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -17,6 +17,7 @@ package org.apache.lucene.analysis.ru;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
@ -26,11 +27,15 @@ import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.CharArraySet;
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||||
|
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
||||||
import org.apache.lucene.analysis.StopFilter;
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.WordlistLoader;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -40,13 +45,22 @@ import org.apache.lucene.util.Version;
|
||||||
* will not be indexed at all).
|
* will not be indexed at all).
|
||||||
* A default set of stopwords is used unless an alternative list is specified.
|
* A default set of stopwords is used unless an alternative list is specified.
|
||||||
* </p>
|
* </p>
|
||||||
|
* <a name="version"/>
|
||||||
|
* <p>You must specify the required {@link Version}
|
||||||
|
* compatibility when creating RussianAnalyzer:
|
||||||
|
* <ul>
|
||||||
|
* <li> As of 3.1, StandardTokenizer is used, Snowball stemming is done with
|
||||||
|
* SnowballFilter, and Snowball stopwords are used by default.
|
||||||
|
* </ul>
|
||||||
*/
|
*/
|
||||||
public final class RussianAnalyzer extends StopwordAnalyzerBase
|
public final class RussianAnalyzer extends StopwordAnalyzerBase
|
||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
* List of typical Russian stopwords.
|
* List of typical Russian stopwords. (for backwards compatibility)
|
||||||
|
* @deprecated Remove this for LUCENE 4.0
|
||||||
*/
|
*/
|
||||||
private static final String[] RUSSIAN_STOP_WORDS = {
|
@Deprecated
|
||||||
|
private static final String[] RUSSIAN_STOP_WORDS_30 = {
|
||||||
"а", "без", "более", "бы", "был", "была", "были", "было", "быть", "в",
|
"а", "без", "более", "бы", "был", "была", "были", "было", "быть", "в",
|
||||||
"вам", "вас", "весь", "во", "вот", "все", "всего", "всех", "вы", "где",
|
"вам", "вас", "весь", "во", "вот", "все", "всего", "всех", "вы", "где",
|
||||||
"да", "даже", "для", "до", "его", "ее", "ей", "ею", "если", "есть",
|
"да", "даже", "для", "до", "его", "ее", "ей", "ею", "если", "есть",
|
||||||
|
@ -59,10 +73,27 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
|
||||||
"чем", "что", "чтобы", "чье", "чья", "эта", "эти", "это", "я"
|
"чем", "что", "чтобы", "чье", "чья", "эта", "эти", "это", "я"
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/** File containing default Russian stopwords. */
|
||||||
|
public final static String DEFAULT_STOPWORD_FILE = "russian_stop.txt";
|
||||||
|
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET = CharArraySet
|
/** @deprecated remove this for Lucene 4.0 */
|
||||||
|
@Deprecated
|
||||||
|
static final Set<?> DEFAULT_STOP_SET_30 = CharArraySet
|
||||||
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
|
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
|
||||||
Arrays.asList(RUSSIAN_STOP_WORDS), false));
|
Arrays.asList(RUSSIAN_STOP_WORDS_30), false));
|
||||||
|
static final Set<?> DEFAULT_STOP_SET;
|
||||||
|
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
DEFAULT_STOP_SET =
|
||||||
|
WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
// default set should always be present as it is part of the
|
||||||
|
// distribution (JAR)
|
||||||
|
throw new RuntimeException("Unable to load default stopword set");
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private final Set<?> stemExclusionSet;
|
private final Set<?> stemExclusionSet;
|
||||||
|
@ -77,7 +108,9 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
|
||||||
}
|
}
|
||||||
|
|
||||||
public RussianAnalyzer(Version matchVersion) {
|
public RussianAnalyzer(Version matchVersion) {
|
||||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
this(matchVersion,
|
||||||
|
matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_STOP_SET
|
||||||
|
: DefaultSetHolder.DEFAULT_STOP_SET_30);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -132,19 +165,30 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
|
||||||
* provided {@link Reader}.
|
* provided {@link Reader}.
|
||||||
*
|
*
|
||||||
* @return {@link TokenStreamComponents} built from a
|
* @return {@link TokenStreamComponents} built from a
|
||||||
* {@link RussianLetterTokenizer} filtered with
|
* {@link StandardTokenizer} filtered with {@link StandardFilter},
|
||||||
* {@link LowerCaseFilter}, {@link StopFilter},
|
* {@link LowerCaseFilter}, {@link StopFilter},
|
||||||
* and {@link RussianStemFilter}
|
* {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided,
|
||||||
|
* and {@link SnowballFilter}
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(String fieldName,
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
Reader reader) {
|
Reader reader) {
|
||||||
|
if (matchVersion.onOrAfter(Version.LUCENE_31)) {
|
||||||
|
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||||
|
TokenStream result = new StandardFilter(source);
|
||||||
|
result = new LowerCaseFilter(matchVersion, result);
|
||||||
|
result = new StopFilter(matchVersion, result, stopwords);
|
||||||
|
if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerTokenFilter(
|
||||||
|
result, stemExclusionSet);
|
||||||
|
result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
|
||||||
|
return new TokenStreamComponents(source, result);
|
||||||
|
} else {
|
||||||
final Tokenizer source = new RussianLetterTokenizer(matchVersion, reader);
|
final Tokenizer source = new RussianLetterTokenizer(matchVersion, reader);
|
||||||
TokenStream result = new LowerCaseFilter(matchVersion, source);
|
TokenStream result = new LowerCaseFilter(matchVersion, source);
|
||||||
result = new StopFilter(matchVersion, result, stopwords);
|
result = new StopFilter(matchVersion, result, stopwords);
|
||||||
if(!stemExclusionSet.isEmpty())
|
if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerTokenFilter(
|
||||||
result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
|
result, stemExclusionSet);
|
||||||
return new TokenStreamComponents(source, new RussianStemFilter(result));
|
return new TokenStreamComponents(source, new RussianStemFilter(result));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,6 +21,7 @@ import java.io.Reader;
|
||||||
import org.apache.lucene.analysis.CharTokenizer;
|
import org.apache.lucene.analysis.CharTokenizer;
|
||||||
import org.apache.lucene.analysis.Tokenizer; // for javadocs
|
import org.apache.lucene.analysis.Tokenizer; // for javadocs
|
||||||
import org.apache.lucene.analysis.LetterTokenizer; // for javadocs
|
import org.apache.lucene.analysis.LetterTokenizer; // for javadocs
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer; // for javadocs
|
||||||
import org.apache.lucene.util.AttributeSource;
|
import org.apache.lucene.util.AttributeSource;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
@ -36,7 +37,10 @@ import org.apache.lucene.util.Version;
|
||||||
* detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
|
* detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
|
||||||
* {@link CharTokenizer#normalize(int)} for details.</li>
|
* {@link CharTokenizer#normalize(int)} for details.</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
|
* @deprecated Use {@link StandardTokenizer} instead, which has the same functionality.
|
||||||
|
* This tokenizer will be removed in Lucene 4.0
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public class RussianLetterTokenizer extends CharTokenizer
|
public class RussianLetterTokenizer extends CharTokenizer
|
||||||
{
|
{
|
||||||
private static final int DIGIT_0 = '0';
|
private static final int DIGIT_0 = '0';
|
||||||
|
|
|
@ -24,6 +24,7 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
import org.apache.lucene.analysis.ru.RussianStemmer;//javadoc @link
|
import org.apache.lucene.analysis.ru.RussianStemmer;//javadoc @link
|
||||||
|
import org.apache.lucene.analysis.snowball.SnowballFilter; // javadoc @link
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
|
@ -40,7 +41,11 @@ import java.io.IOException;
|
||||||
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||||
* </p>
|
* </p>
|
||||||
* @see KeywordMarkerTokenFilter
|
* @see KeywordMarkerTokenFilter
|
||||||
|
* @deprecated Use {@link SnowballFilter} with
|
||||||
|
* {@link org.tartarus.snowball.ext.RussianStemmer} instead, which has the
|
||||||
|
* same functionality. This filter will be removed in Lucene 4.0
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public final class RussianStemFilter extends TokenFilter
|
public final class RussianStemFilter extends TokenFilter
|
||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -19,7 +19,10 @@ package org.apache.lucene.analysis.ru;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
|
* Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
|
||||||
|
* @deprecated Use {@link org.tartarus.snowball.ext.RussianStemmer} instead,
|
||||||
|
* which has the same functionality. This class will be removed in Lucene 4.0
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
class RussianStemmer
|
class RussianStemmer
|
||||||
{
|
{
|
||||||
// positions of RV, R1 and R2 respectively
|
// positions of RV, R1 and R2 respectively
|
||||||
|
|
|
@ -39,7 +39,10 @@ import java.util.Set;
|
||||||
* <li> As of 3.1, uses {@link TurkishLowerCaseFilter} for Turkish language.
|
* <li> As of 3.1, uses {@link TurkishLowerCaseFilter} for Turkish language.
|
||||||
* </ul>
|
* </ul>
|
||||||
* </p>
|
* </p>
|
||||||
|
* @deprecated Use the language-specific analyzer in contrib/analyzers instead.
|
||||||
|
* This analyzer will be removed in Lucene 4.0
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public final class SnowballAnalyzer extends Analyzer {
|
public final class SnowballAnalyzer extends Analyzer {
|
||||||
private String name;
|
private String name;
|
||||||
private Set<?> stopSet;
|
private Set<?> stopSet;
|
||||||
|
|
|
@ -21,6 +21,7 @@ import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; // javadoc @link
|
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; // javadoc @link
|
||||||
import org.apache.lucene.analysis.LowerCaseFilter; // javadoc @link
|
import org.apache.lucene.analysis.LowerCaseFilter; // javadoc @link
|
||||||
|
@ -39,14 +40,14 @@ import org.tartarus.snowball.SnowballProgram;
|
||||||
*/
|
*/
|
||||||
public final class SnowballFilter extends TokenFilter {
|
public final class SnowballFilter extends TokenFilter {
|
||||||
|
|
||||||
private SnowballProgram stemmer;
|
private final SnowballProgram stemmer;
|
||||||
|
|
||||||
private TermAttribute termAtt;
|
private final TermAttribute termAtt = addAttribute(TermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
public SnowballFilter(TokenStream input, SnowballProgram stemmer) {
|
public SnowballFilter(TokenStream input, SnowballProgram stemmer) {
|
||||||
super(input);
|
super(input);
|
||||||
this.stemmer = stemmer;
|
this.stemmer = stemmer;
|
||||||
termAtt = addAttribute(TermAttribute.class);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -67,13 +68,13 @@ public final class SnowballFilter extends TokenFilter {
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new RuntimeException(e.toString());
|
throw new RuntimeException(e.toString());
|
||||||
}
|
}
|
||||||
termAtt = addAttribute(TermAttribute.class);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns the next input Token, after being stemmed */
|
/** Returns the next input Token, after being stemmed */
|
||||||
@Override
|
@Override
|
||||||
public final boolean incrementToken() throws IOException {
|
public final boolean incrementToken() throws IOException {
|
||||||
if (input.incrementToken()) {
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAttr.isKeyword()) {
|
||||||
char termBuffer[] = termAtt.termBuffer();
|
char termBuffer[] = termAtt.termBuffer();
|
||||||
final int length = termAtt.termLength();
|
final int length = termAtt.termLength();
|
||||||
stemmer.setCurrent(termBuffer, length);
|
stemmer.setCurrent(termBuffer, length);
|
||||||
|
@ -84,6 +85,7 @@ public final class SnowballFilter extends TokenFilter {
|
||||||
termAtt.setTermBuffer(finalTerm, 0, newLength);
|
termAtt.setTermBuffer(finalTerm, 0, newLength);
|
||||||
else
|
else
|
||||||
termAtt.setTermLength(newLength);
|
termAtt.setTermLength(newLength);
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
} else {
|
} else {
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -0,0 +1,129 @@
|
||||||
|
package org.apache.lucene.analysis.sv;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
||||||
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.WordlistLoader;
|
||||||
|
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||||
|
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
import org.tartarus.snowball.ext.SwedishStemmer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@link Analyzer} for Swedish.
|
||||||
|
*/
|
||||||
|
public final class SwedishAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
private final Set<?> stemExclusionSet;
|
||||||
|
|
||||||
|
/** File containing default Swedish stopwords. */
|
||||||
|
public final static String DEFAULT_STOPWORD_FILE = "swedish_stop.txt";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
|
* @return default stop words set.
|
||||||
|
*/
|
||||||
|
public static Set<?> getDefaultStopSet(){
|
||||||
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||||
|
* accesses the static final set the first time.;
|
||||||
|
*/
|
||||||
|
private static class DefaultSetHolder {
|
||||||
|
static final Set<?> DEFAULT_STOP_SET;
|
||||||
|
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
|
||||||
|
DEFAULT_STOPWORD_FILE);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
// default set should always be present as it is part of the
|
||||||
|
// distribution (JAR)
|
||||||
|
throw new RuntimeException("Unable to load default stopword set");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||||
|
*/
|
||||||
|
public SwedishAnalyzer(Version matchVersion) {
|
||||||
|
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
*/
|
||||||
|
public SwedishAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||||
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
|
||||||
|
* provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
|
||||||
|
* stemming.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
|
*/
|
||||||
|
public SwedishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||||
|
super(matchVersion, stopwords);
|
||||||
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
|
matchVersion, stemExclusionSet));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
|
||||||
|
* {@link Reader}.
|
||||||
|
*
|
||||||
|
* @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
|
||||||
|
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
|
||||||
|
* {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
|
||||||
|
* exclusion set is provided and {@link SnowballFilter}.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||||
|
TokenStream result = new StandardFilter(source);
|
||||||
|
result = new LowerCaseFilter(matchVersion, result);
|
||||||
|
result = new StopFilter(matchVersion, result, stopwords);
|
||||||
|
if(!stemExclusionSet.isEmpty())
|
||||||
|
result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
|
||||||
|
result = new SnowballFilter(result, new SwedishStemmer());
|
||||||
|
return new TokenStreamComponents(source, result);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Analyzer for Swedish.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.th;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.lang.Character.UnicodeBlock;
|
import java.lang.Character.UnicodeBlock;
|
||||||
import org.apache.lucene.analysis.Token;
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
|
|
|
@ -0,0 +1,132 @@
|
||||||
|
package org.apache.lucene.analysis.tr;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
||||||
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||||
|
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
import org.tartarus.snowball.ext.TurkishStemmer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@link Analyzer} for Turkish.
|
||||||
|
*/
|
||||||
|
public final class TurkishAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
private final Set<?> stemExclusionSet;
|
||||||
|
|
||||||
|
/** File containing default Turkish stopwords. */
|
||||||
|
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||||
|
/**
|
||||||
|
* The comment character in the stopwords file.
|
||||||
|
* All lines prefixed with this will be ignored.
|
||||||
|
*/
|
||||||
|
private static final String STOPWORDS_COMMENT = "#";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
|
* @return default stop words set.
|
||||||
|
*/
|
||||||
|
public static Set<?> getDefaultStopSet(){
|
||||||
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||||
|
* accesses the static final set the first time.;
|
||||||
|
*/
|
||||||
|
private static class DefaultSetHolder {
|
||||||
|
static final Set<?> DEFAULT_STOP_SET;
|
||||||
|
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
DEFAULT_STOP_SET = loadStopwordSet(false, TurkishAnalyzer.class,
|
||||||
|
DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
// default set should always be present as it is part of the
|
||||||
|
// distribution (JAR)
|
||||||
|
throw new RuntimeException("Unable to load default stopword set");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||||
|
*/
|
||||||
|
public TurkishAnalyzer(Version matchVersion) {
|
||||||
|
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
*/
|
||||||
|
public TurkishAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||||
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
|
||||||
|
* provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
|
||||||
|
* stemming.
|
||||||
|
*
|
||||||
|
* @param matchVersion lucene compatibility version
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
|
*/
|
||||||
|
public TurkishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||||
|
super(matchVersion, stopwords);
|
||||||
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
|
matchVersion, stemExclusionSet));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
|
||||||
|
* {@link Reader}.
|
||||||
|
*
|
||||||
|
* @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
|
||||||
|
* filtered with {@link StandardFilter}, {@link TurkishLowerCaseFilter},
|
||||||
|
* {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
|
||||||
|
* exclusion set is provided and {@link SnowballFilter}.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||||
|
TokenStream result = new StandardFilter(source);
|
||||||
|
result = new TurkishLowerCaseFilter(result);
|
||||||
|
result = new StopFilter(matchVersion, result, stopwords);
|
||||||
|
if(!stemExclusionSet.isEmpty())
|
||||||
|
result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
|
||||||
|
result = new SnowballFilter(result, new TurkishStemmer());
|
||||||
|
return new TokenStreamComponents(source, result);
|
||||||
|
}
|
||||||
|
}
|
|
@ -17,15 +17,6 @@
|
||||||
-->
|
-->
|
||||||
<html><head></head>
|
<html><head></head>
|
||||||
<body>
|
<body>
|
||||||
Support for Turkish.
|
Analyzer for Turkish.
|
||||||
<p>
|
|
||||||
This package contains just the TokenStream for handling turkish casing,
|
|
||||||
for a stemmer please see the snowball package.
|
|
||||||
</p>
|
|
||||||
<p>
|
|
||||||
WARNING: SnowballAnalyzer uses LowerCaseFilter by default, even when the
|
|
||||||
language is set to Turkish, so you will need to construct your own
|
|
||||||
analyzer that combines TurkishLowerCaseFilter and SnowballFilter.
|
|
||||||
</p>
|
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
|
@ -0,0 +1,233 @@
|
||||||
|
# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||||
|
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||||
|
acea
|
||||||
|
aceasta
|
||||||
|
această
|
||||||
|
aceea
|
||||||
|
acei
|
||||||
|
aceia
|
||||||
|
acel
|
||||||
|
acela
|
||||||
|
acele
|
||||||
|
acelea
|
||||||
|
acest
|
||||||
|
acesta
|
||||||
|
aceste
|
||||||
|
acestea
|
||||||
|
aceşti
|
||||||
|
aceştia
|
||||||
|
acolo
|
||||||
|
acum
|
||||||
|
ai
|
||||||
|
aia
|
||||||
|
aibă
|
||||||
|
aici
|
||||||
|
al
|
||||||
|
ăla
|
||||||
|
ale
|
||||||
|
alea
|
||||||
|
ălea
|
||||||
|
altceva
|
||||||
|
altcineva
|
||||||
|
am
|
||||||
|
ar
|
||||||
|
are
|
||||||
|
aş
|
||||||
|
aşadar
|
||||||
|
asemenea
|
||||||
|
asta
|
||||||
|
ăsta
|
||||||
|
astăzi
|
||||||
|
astea
|
||||||
|
ăstea
|
||||||
|
ăştia
|
||||||
|
asupra
|
||||||
|
aţi
|
||||||
|
au
|
||||||
|
avea
|
||||||
|
avem
|
||||||
|
aveţi
|
||||||
|
azi
|
||||||
|
bine
|
||||||
|
bucur
|
||||||
|
bună
|
||||||
|
ca
|
||||||
|
că
|
||||||
|
căci
|
||||||
|
când
|
||||||
|
care
|
||||||
|
cărei
|
||||||
|
căror
|
||||||
|
cărui
|
||||||
|
cât
|
||||||
|
câte
|
||||||
|
câţi
|
||||||
|
către
|
||||||
|
câtva
|
||||||
|
ce
|
||||||
|
cel
|
||||||
|
ceva
|
||||||
|
chiar
|
||||||
|
cînd
|
||||||
|
cine
|
||||||
|
cineva
|
||||||
|
cît
|
||||||
|
cîte
|
||||||
|
cîţi
|
||||||
|
cîtva
|
||||||
|
contra
|
||||||
|
cu
|
||||||
|
cum
|
||||||
|
cumva
|
||||||
|
curând
|
||||||
|
curînd
|
||||||
|
da
|
||||||
|
dă
|
||||||
|
dacă
|
||||||
|
dar
|
||||||
|
datorită
|
||||||
|
de
|
||||||
|
deci
|
||||||
|
deja
|
||||||
|
deoarece
|
||||||
|
departe
|
||||||
|
deşi
|
||||||
|
din
|
||||||
|
dinaintea
|
||||||
|
dintr
|
||||||
|
dintre
|
||||||
|
drept
|
||||||
|
după
|
||||||
|
ea
|
||||||
|
ei
|
||||||
|
el
|
||||||
|
ele
|
||||||
|
eram
|
||||||
|
este
|
||||||
|
eşti
|
||||||
|
eu
|
||||||
|
face
|
||||||
|
fără
|
||||||
|
fi
|
||||||
|
fie
|
||||||
|
fiecare
|
||||||
|
fii
|
||||||
|
fim
|
||||||
|
fiţi
|
||||||
|
iar
|
||||||
|
ieri
|
||||||
|
îi
|
||||||
|
îl
|
||||||
|
îmi
|
||||||
|
împotriva
|
||||||
|
în
|
||||||
|
înainte
|
||||||
|
înaintea
|
||||||
|
încât
|
||||||
|
încît
|
||||||
|
încotro
|
||||||
|
între
|
||||||
|
întrucât
|
||||||
|
întrucît
|
||||||
|
îţi
|
||||||
|
la
|
||||||
|
lângă
|
||||||
|
le
|
||||||
|
li
|
||||||
|
lîngă
|
||||||
|
lor
|
||||||
|
lui
|
||||||
|
mă
|
||||||
|
mâine
|
||||||
|
mea
|
||||||
|
mei
|
||||||
|
mele
|
||||||
|
mereu
|
||||||
|
meu
|
||||||
|
mi
|
||||||
|
mine
|
||||||
|
mult
|
||||||
|
multă
|
||||||
|
mulţi
|
||||||
|
ne
|
||||||
|
nicăieri
|
||||||
|
nici
|
||||||
|
nimeni
|
||||||
|
nişte
|
||||||
|
noastră
|
||||||
|
noastre
|
||||||
|
noi
|
||||||
|
noştri
|
||||||
|
nostru
|
||||||
|
nu
|
||||||
|
ori
|
||||||
|
oricând
|
||||||
|
oricare
|
||||||
|
oricât
|
||||||
|
orice
|
||||||
|
oricînd
|
||||||
|
oricine
|
||||||
|
oricît
|
||||||
|
oricum
|
||||||
|
oriunde
|
||||||
|
până
|
||||||
|
pe
|
||||||
|
pentru
|
||||||
|
peste
|
||||||
|
pînă
|
||||||
|
poate
|
||||||
|
pot
|
||||||
|
prea
|
||||||
|
prima
|
||||||
|
primul
|
||||||
|
prin
|
||||||
|
printr
|
||||||
|
sa
|
||||||
|
să
|
||||||
|
săi
|
||||||
|
sale
|
||||||
|
sau
|
||||||
|
său
|
||||||
|
se
|
||||||
|
şi
|
||||||
|
sînt
|
||||||
|
sîntem
|
||||||
|
sînteţi
|
||||||
|
spre
|
||||||
|
sub
|
||||||
|
sunt
|
||||||
|
suntem
|
||||||
|
sunteţi
|
||||||
|
ta
|
||||||
|
tăi
|
||||||
|
tale
|
||||||
|
tău
|
||||||
|
te
|
||||||
|
ţi
|
||||||
|
ţie
|
||||||
|
tine
|
||||||
|
toată
|
||||||
|
toate
|
||||||
|
tot
|
||||||
|
toţi
|
||||||
|
totuşi
|
||||||
|
tu
|
||||||
|
un
|
||||||
|
una
|
||||||
|
unde
|
||||||
|
undeva
|
||||||
|
unei
|
||||||
|
unele
|
||||||
|
uneori
|
||||||
|
unor
|
||||||
|
vă
|
||||||
|
vi
|
||||||
|
voastră
|
||||||
|
voastre
|
||||||
|
voi
|
||||||
|
voştri
|
||||||
|
vostru
|
||||||
|
vouă
|
||||||
|
vreo
|
||||||
|
vreun
|
|
@ -0,0 +1,212 @@
|
||||||
|
# Turkish stopwords from LUCENE-559
|
||||||
|
# merged with the list from "Information Retrieval on Turkish Texts"
|
||||||
|
# (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf)
|
||||||
|
acaba
|
||||||
|
altmış
|
||||||
|
altı
|
||||||
|
ama
|
||||||
|
ancak
|
||||||
|
arada
|
||||||
|
aslında
|
||||||
|
ayrıca
|
||||||
|
bana
|
||||||
|
bazı
|
||||||
|
belki
|
||||||
|
ben
|
||||||
|
benden
|
||||||
|
beni
|
||||||
|
benim
|
||||||
|
beri
|
||||||
|
beş
|
||||||
|
bile
|
||||||
|
bin
|
||||||
|
bir
|
||||||
|
birçok
|
||||||
|
biri
|
||||||
|
birkaç
|
||||||
|
birkez
|
||||||
|
birşey
|
||||||
|
birşeyi
|
||||||
|
biz
|
||||||
|
bize
|
||||||
|
bizden
|
||||||
|
bizi
|
||||||
|
bizim
|
||||||
|
böyle
|
||||||
|
böylece
|
||||||
|
bu
|
||||||
|
buna
|
||||||
|
bunda
|
||||||
|
bundan
|
||||||
|
bunlar
|
||||||
|
bunları
|
||||||
|
bunların
|
||||||
|
bunu
|
||||||
|
bunun
|
||||||
|
burada
|
||||||
|
çok
|
||||||
|
çünkü
|
||||||
|
da
|
||||||
|
daha
|
||||||
|
dahi
|
||||||
|
de
|
||||||
|
defa
|
||||||
|
değil
|
||||||
|
diğer
|
||||||
|
diye
|
||||||
|
doksan
|
||||||
|
dokuz
|
||||||
|
dolayı
|
||||||
|
dolayısıyla
|
||||||
|
dört
|
||||||
|
edecek
|
||||||
|
eden
|
||||||
|
ederek
|
||||||
|
edilecek
|
||||||
|
ediliyor
|
||||||
|
edilmesi
|
||||||
|
ediyor
|
||||||
|
eğer
|
||||||
|
elli
|
||||||
|
en
|
||||||
|
etmesi
|
||||||
|
etti
|
||||||
|
ettiği
|
||||||
|
ettiğini
|
||||||
|
gibi
|
||||||
|
göre
|
||||||
|
halen
|
||||||
|
hangi
|
||||||
|
hatta
|
||||||
|
hem
|
||||||
|
henüz
|
||||||
|
hep
|
||||||
|
hepsi
|
||||||
|
her
|
||||||
|
herhangi
|
||||||
|
herkesin
|
||||||
|
hiç
|
||||||
|
hiçbir
|
||||||
|
için
|
||||||
|
iki
|
||||||
|
ile
|
||||||
|
ilgili
|
||||||
|
ise
|
||||||
|
işte
|
||||||
|
itibaren
|
||||||
|
itibariyle
|
||||||
|
kadar
|
||||||
|
karşın
|
||||||
|
katrilyon
|
||||||
|
kendi
|
||||||
|
kendilerine
|
||||||
|
kendini
|
||||||
|
kendisi
|
||||||
|
kendisine
|
||||||
|
kendisini
|
||||||
|
kez
|
||||||
|
ki
|
||||||
|
kim
|
||||||
|
kimden
|
||||||
|
kime
|
||||||
|
kimi
|
||||||
|
kimse
|
||||||
|
kırk
|
||||||
|
milyar
|
||||||
|
milyon
|
||||||
|
mu
|
||||||
|
mü
|
||||||
|
mı
|
||||||
|
nasıl
|
||||||
|
ne
|
||||||
|
neden
|
||||||
|
nedenle
|
||||||
|
nerde
|
||||||
|
nerede
|
||||||
|
nereye
|
||||||
|
niye
|
||||||
|
niçin
|
||||||
|
o
|
||||||
|
olan
|
||||||
|
olarak
|
||||||
|
oldu
|
||||||
|
olduğu
|
||||||
|
olduğunu
|
||||||
|
olduklarını
|
||||||
|
olmadı
|
||||||
|
olmadığı
|
||||||
|
olmak
|
||||||
|
olması
|
||||||
|
olmayan
|
||||||
|
olmaz
|
||||||
|
olsa
|
||||||
|
olsun
|
||||||
|
olup
|
||||||
|
olur
|
||||||
|
olursa
|
||||||
|
oluyor
|
||||||
|
on
|
||||||
|
ona
|
||||||
|
ondan
|
||||||
|
onlar
|
||||||
|
onlardan
|
||||||
|
onları
|
||||||
|
onların
|
||||||
|
onu
|
||||||
|
onun
|
||||||
|
otuz
|
||||||
|
oysa
|
||||||
|
öyle
|
||||||
|
pek
|
||||||
|
rağmen
|
||||||
|
sadece
|
||||||
|
sanki
|
||||||
|
sekiz
|
||||||
|
seksen
|
||||||
|
sen
|
||||||
|
senden
|
||||||
|
seni
|
||||||
|
senin
|
||||||
|
siz
|
||||||
|
sizden
|
||||||
|
sizi
|
||||||
|
sizin
|
||||||
|
şey
|
||||||
|
şeyden
|
||||||
|
şeyi
|
||||||
|
şeyler
|
||||||
|
şöyle
|
||||||
|
şu
|
||||||
|
şuna
|
||||||
|
şunda
|
||||||
|
şundan
|
||||||
|
şunları
|
||||||
|
şunu
|
||||||
|
tarafından
|
||||||
|
trilyon
|
||||||
|
tüm
|
||||||
|
üç
|
||||||
|
üzere
|
||||||
|
var
|
||||||
|
vardı
|
||||||
|
ve
|
||||||
|
veya
|
||||||
|
ya
|
||||||
|
yani
|
||||||
|
yapacak
|
||||||
|
yapılan
|
||||||
|
yapılması
|
||||||
|
yapıyor
|
||||||
|
yapmak
|
||||||
|
yaptı
|
||||||
|
yaptığı
|
||||||
|
yaptığını
|
||||||
|
yaptıkları
|
||||||
|
yedi
|
||||||
|
yerine
|
||||||
|
yetmiş
|
||||||
|
yine
|
||||||
|
yirmi
|
||||||
|
yoksa
|
||||||
|
yüz
|
||||||
|
zaten
|
|
@ -22,8 +22,6 @@ import java.util.Collections;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import javax.print.DocFlavor.CHAR_ARRAY;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.CharArraySet;
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
|
@ -21,7 +21,6 @@ import java.io.IOException;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -0,0 +1,54 @@
|
||||||
|
package org.apache.lucene.analysis.da;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
/** This test fails with NPE when the
|
||||||
|
* stopwords file is missing in classpath */
|
||||||
|
public void testResourcesAvailable() {
|
||||||
|
new DanishAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test stopwords and stemming */
|
||||||
|
public void testBasics() throws IOException {
|
||||||
|
Analyzer a = new DanishAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
// stemming
|
||||||
|
checkOneTermReuse(a, "undersøg", "undersøg");
|
||||||
|
checkOneTermReuse(a, "undersøgelse", "undersøg");
|
||||||
|
// stopword
|
||||||
|
assertAnalyzesTo(a, "på", new String[] {});
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test use of exclusion set */
|
||||||
|
public void testExclude() throws IOException {
|
||||||
|
Set<String> exclusionSet = new HashSet<String>();
|
||||||
|
exclusionSet.add("undersøgelse");
|
||||||
|
Analyzer a = new DanishAnalyzer(Version.LUCENE_CURRENT,
|
||||||
|
DanishAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
|
checkOneTermReuse(a, "undersøgelse", "undersøgelse");
|
||||||
|
checkOneTermReuse(a, "undersøg", "undersøg");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,93 @@
|
||||||
|
package org.apache.lucene.analysis.de;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
||||||
|
import org.apache.lucene.analysis.LowerCaseTokenizer;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
public void testReusableTokenStream() throws Exception {
|
||||||
|
Analyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
checkOneTermReuse(a, "Tisch", "tisch");
|
||||||
|
checkOneTermReuse(a, "Tische", "tisch");
|
||||||
|
checkOneTermReuse(a, "Tischen", "tisch");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testExclusionTableBWCompat() throws IOException {
|
||||||
|
GermanStemFilter filter = new GermanStemFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT,
|
||||||
|
new StringReader("Fischen Trinken")));
|
||||||
|
CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
|
||||||
|
set.add("fischen");
|
||||||
|
filter.setExclusionSet(set);
|
||||||
|
assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testWithKeywordAttribute() throws IOException {
|
||||||
|
CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
|
||||||
|
set.add("fischen");
|
||||||
|
GermanStemFilter filter = new GermanStemFilter(
|
||||||
|
new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
|
||||||
|
"Fischen Trinken")), set));
|
||||||
|
assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testWithKeywordAttributeAndExclusionTable() throws IOException {
|
||||||
|
CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
|
||||||
|
set.add("fischen");
|
||||||
|
CharArraySet set1 = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
|
||||||
|
set1.add("trinken");
|
||||||
|
set1.add("fischen");
|
||||||
|
GermanStemFilter filter = new GermanStemFilter(
|
||||||
|
new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
|
||||||
|
"Fischen Trinken")), set));
|
||||||
|
filter.setExclusionSet(set1);
|
||||||
|
assertTokenStreamContents(filter, new String[] { "fischen", "trinken" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Test that changes to the exclusion table are applied immediately
|
||||||
|
* when using reusable token streams.
|
||||||
|
*/
|
||||||
|
public void testExclusionTableReuse() throws Exception {
|
||||||
|
GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
checkOneTermReuse(a, "tischen", "tisch");
|
||||||
|
a.setStemExclusionTable(new String[] { "tischen" });
|
||||||
|
checkOneTermReuse(a, "tischen", "tischen");
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test some features of the new snowball filter
|
||||||
|
* these only pass with LUCENE_CURRENT, not if you use o.a.l.a.de.GermanStemmer
|
||||||
|
*/
|
||||||
|
public void testGermanSpecials() throws Exception {
|
||||||
|
GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
// a/o/u + e is equivalent to the umlaut form
|
||||||
|
checkOneTermReuse(a, "Schaltflächen", "schaltflach");
|
||||||
|
checkOneTermReuse(a, "Schaltflaechen", "schaltflach");
|
||||||
|
// here they are with the old stemmer
|
||||||
|
a = new GermanAnalyzer(Version.LUCENE_30);
|
||||||
|
checkOneTermReuse(a, "Schaltflächen", "schaltflach");
|
||||||
|
checkOneTermReuse(a, "Schaltflaechen", "schaltflaech");
|
||||||
|
}
|
||||||
|
}
|
|
@ -20,15 +20,14 @@ package org.apache.lucene.analysis.de;
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.KeywordTokenizer;
|
||||||
import org.apache.lucene.analysis.CharArraySet;
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.LowerCaseTokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -40,6 +39,8 @@ import org.apache.lucene.util.Version;
|
||||||
public class TestGermanStemFilter extends BaseTokenStreamTestCase {
|
public class TestGermanStemFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
public void testStemming() throws Exception {
|
public void testStemming() throws Exception {
|
||||||
|
Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
|
||||||
|
TokenFilter filter = new GermanStemFilter(new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer));
|
||||||
// read test cases from external file:
|
// read test cases from external file:
|
||||||
File dataDir = new File(System.getProperty("dataDir", "./bin"));
|
File dataDir = new File(System.getProperty("dataDir", "./bin"));
|
||||||
File testFile = new File(dataDir, "org/apache/lucene/analysis/de/data.txt");
|
File testFile = new File(dataDir, "org/apache/lucene/analysis/de/data.txt");
|
||||||
|
@ -55,68 +56,12 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
|
||||||
continue; // ignore comments and empty lines
|
continue; // ignore comments and empty lines
|
||||||
String[] parts = line.split(";");
|
String[] parts = line.split(";");
|
||||||
//System.out.println(parts[0] + " -- " + parts[1]);
|
//System.out.println(parts[0] + " -- " + parts[1]);
|
||||||
check(parts[0], parts[1]);
|
tokenizer.reset(new StringReader(parts[0]));
|
||||||
|
filter.reset();
|
||||||
|
assertTokenStreamContents(filter, new String[] { parts[1] });
|
||||||
}
|
}
|
||||||
breader.close();
|
breader.close();
|
||||||
isr.close();
|
isr.close();
|
||||||
fis.close();
|
fis.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testReusableTokenStream() throws Exception {
|
|
||||||
Analyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
|
|
||||||
checkReuse(a, "Tisch", "tisch");
|
|
||||||
checkReuse(a, "Tische", "tisch");
|
|
||||||
checkReuse(a, "Tischen", "tisch");
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testExclusionTableBWCompat() throws IOException {
|
|
||||||
GermanStemFilter filter = new GermanStemFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT,
|
|
||||||
new StringReader("Fischen Trinken")));
|
|
||||||
CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
|
|
||||||
set.add("fischen");
|
|
||||||
filter.setExclusionSet(set);
|
|
||||||
assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testWithKeywordAttribute() throws IOException {
|
|
||||||
CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
|
|
||||||
set.add("fischen");
|
|
||||||
GermanStemFilter filter = new GermanStemFilter(
|
|
||||||
new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
|
|
||||||
"Fischen Trinken")), set));
|
|
||||||
assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testWithKeywordAttributeAndExclusionTable() throws IOException {
|
|
||||||
CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
|
|
||||||
set.add("fischen");
|
|
||||||
CharArraySet set1 = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
|
|
||||||
set1.add("trinken");
|
|
||||||
set1.add("fischen");
|
|
||||||
GermanStemFilter filter = new GermanStemFilter(
|
|
||||||
new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
|
|
||||||
"Fischen Trinken")), set));
|
|
||||||
filter.setExclusionSet(set1);
|
|
||||||
assertTokenStreamContents(filter, new String[] { "fischen", "trinken" });
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Test that changes to the exclusion table are applied immediately
|
|
||||||
* when using reusable token streams.
|
|
||||||
*/
|
|
||||||
public void testExclusionTableReuse() throws Exception {
|
|
||||||
GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
|
|
||||||
checkReuse(a, "tischen", "tisch");
|
|
||||||
a.setStemExclusionTable(new String[] { "tischen" });
|
|
||||||
checkReuse(a, "tischen", "tischen");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void check(final String input, final String expected) throws Exception {
|
|
||||||
checkOneTerm(new GermanAnalyzer(Version.LUCENE_CURRENT), input, expected);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void checkReuse(Analyzer a, String input, String expected) throws Exception {
|
|
||||||
checkOneTermReuse(a, input, expected);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.lucene.analysis.el;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -63,4 +62,23 @@ public class GreekAnalyzerTest extends BaseTokenStreamTestCase {
|
||||||
assertAnalyzesToReuse(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3 \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9",
|
assertAnalyzesToReuse(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3 \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9",
|
||||||
new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" });
|
new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Greek Analyzer didn't call standardFilter, so no normalization of acronyms.
|
||||||
|
* check that this is preserved.
|
||||||
|
* @deprecated remove this test in Lucene 4.0
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
public void testAcronymBWCompat() throws Exception {
|
||||||
|
Analyzer a = new GreekAnalyzer(Version.LUCENE_30);
|
||||||
|
assertAnalyzesTo(a, "Α.Π.Τ.", new String[] { "α.π.τ." });
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* test that acronym normalization works
|
||||||
|
*/
|
||||||
|
public void testAcronym() throws Exception {
|
||||||
|
Analyzer a = new GreekAnalyzer(Version.LUCENE_31);
|
||||||
|
assertAnalyzesTo(a, "Α.Π.Τ.", new String[] { "απτ" });
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,54 @@
|
||||||
|
package org.apache.lucene.analysis.en;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
/** This test fails with NPE when the
|
||||||
|
* stopwords file is missing in classpath */
|
||||||
|
public void testResourcesAvailable() {
|
||||||
|
new EnglishAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test stopwords and stemming */
|
||||||
|
public void testBasics() throws IOException {
|
||||||
|
Analyzer a = new EnglishAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
// stemming
|
||||||
|
checkOneTermReuse(a, "books", "book");
|
||||||
|
checkOneTermReuse(a, "book", "book");
|
||||||
|
// stopword
|
||||||
|
assertAnalyzesTo(a, "the", new String[] {});
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test use of exclusion set */
|
||||||
|
public void testExclude() throws IOException {
|
||||||
|
Set<String> exclusionSet = new HashSet<String>();
|
||||||
|
exclusionSet.add("books");
|
||||||
|
Analyzer a = new EnglishAnalyzer(Version.LUCENE_CURRENT,
|
||||||
|
EnglishAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
|
checkOneTermReuse(a, "books", "books");
|
||||||
|
checkOneTermReuse(a, "book", "book");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,54 @@
|
||||||
|
package org.apache.lucene.analysis.es;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
/** This test fails with NPE when the
|
||||||
|
* stopwords file is missing in classpath */
|
||||||
|
public void testResourcesAvailable() {
|
||||||
|
new SpanishAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test stopwords and stemming */
|
||||||
|
public void testBasics() throws IOException {
|
||||||
|
Analyzer a = new SpanishAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
// stemming
|
||||||
|
checkOneTermReuse(a, "chicana", "chican");
|
||||||
|
checkOneTermReuse(a, "chicano", "chican");
|
||||||
|
// stopword
|
||||||
|
assertAnalyzesTo(a, "los", new String[] {});
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test use of exclusion set */
|
||||||
|
public void testExclude() throws IOException {
|
||||||
|
Set<String> exclusionSet = new HashSet<String>();
|
||||||
|
exclusionSet.add("chicano");
|
||||||
|
Analyzer a = new SpanishAnalyzer(Version.LUCENE_CURRENT,
|
||||||
|
SpanishAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
|
checkOneTermReuse(a, "chicana", "chican");
|
||||||
|
checkOneTermReuse(a, "chicano", "chicano");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,54 @@
|
||||||
|
package org.apache.lucene.analysis.fi;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
public class TestFinnishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
/** This test fails with NPE when the
|
||||||
|
* stopwords file is missing in classpath */
|
||||||
|
public void testResourcesAvailable() {
|
||||||
|
new FinnishAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test stopwords and stemming */
|
||||||
|
public void testBasics() throws IOException {
|
||||||
|
Analyzer a = new FinnishAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
// stemming
|
||||||
|
checkOneTermReuse(a, "edeltäjiinsä", "edeltäj");
|
||||||
|
checkOneTermReuse(a, "edeltäjistään", "edeltäj");
|
||||||
|
// stopword
|
||||||
|
assertAnalyzesTo(a, "olla", new String[] {});
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test use of exclusion set */
|
||||||
|
public void testExclude() throws IOException {
|
||||||
|
Set<String> exclusionSet = new HashSet<String>();
|
||||||
|
exclusionSet.add("edeltäjistään");
|
||||||
|
Analyzer a = new FinnishAnalyzer(Version.LUCENE_CURRENT,
|
||||||
|
FinnishAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
|
checkOneTermReuse(a, "edeltäjiinsä", "edeltäj");
|
||||||
|
checkOneTermReuse(a, "edeltäjistään", "edeltäjistään");
|
||||||
|
}
|
||||||
|
}
|
|
@ -17,6 +17,8 @@ package org.apache.lucene.analysis.fr;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.CharArraySet;
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
@ -113,6 +115,94 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @deprecated remove this test for Lucene 4.0
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
public void testAnalyzer30() throws Exception {
|
||||||
|
FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_30);
|
||||||
|
|
||||||
|
assertAnalyzesTo(fa, "", new String[] {
|
||||||
|
});
|
||||||
|
|
||||||
|
assertAnalyzesTo(
|
||||||
|
fa,
|
||||||
|
"chien chat cheval",
|
||||||
|
new String[] { "chien", "chat", "cheval" });
|
||||||
|
|
||||||
|
assertAnalyzesTo(
|
||||||
|
fa,
|
||||||
|
"chien CHAT CHEVAL",
|
||||||
|
new String[] { "chien", "chat", "cheval" });
|
||||||
|
|
||||||
|
assertAnalyzesTo(
|
||||||
|
fa,
|
||||||
|
" chien ,? + = - CHAT /: > CHEVAL",
|
||||||
|
new String[] { "chien", "chat", "cheval" });
|
||||||
|
|
||||||
|
assertAnalyzesTo(fa, "chien++", new String[] { "chien" });
|
||||||
|
|
||||||
|
assertAnalyzesTo(
|
||||||
|
fa,
|
||||||
|
"mot \"entreguillemet\"",
|
||||||
|
new String[] { "mot", "entreguillemet" });
|
||||||
|
|
||||||
|
// let's do some french specific tests now
|
||||||
|
|
||||||
|
/* 1. couldn't resist
|
||||||
|
I would expect this to stay one term as in French the minus
|
||||||
|
sign is often used for composing words */
|
||||||
|
assertAnalyzesTo(
|
||||||
|
fa,
|
||||||
|
"Jean-François",
|
||||||
|
new String[] { "jean", "françois" });
|
||||||
|
|
||||||
|
// 2. stopwords
|
||||||
|
assertAnalyzesTo(
|
||||||
|
fa,
|
||||||
|
"le la chien les aux chat du des à cheval",
|
||||||
|
new String[] { "chien", "chat", "cheval" });
|
||||||
|
|
||||||
|
// some nouns and adjectives
|
||||||
|
assertAnalyzesTo(
|
||||||
|
fa,
|
||||||
|
"lances chismes habitable chiste éléments captifs",
|
||||||
|
new String[] {
|
||||||
|
"lanc",
|
||||||
|
"chism",
|
||||||
|
"habit",
|
||||||
|
"chist",
|
||||||
|
"élément",
|
||||||
|
"captif" });
|
||||||
|
|
||||||
|
// some verbs
|
||||||
|
assertAnalyzesTo(
|
||||||
|
fa,
|
||||||
|
"finissions souffrirent rugissante",
|
||||||
|
new String[] { "fin", "souffr", "rug" });
|
||||||
|
|
||||||
|
// some everything else
|
||||||
|
// aujourd'hui stays one term which is OK
|
||||||
|
assertAnalyzesTo(
|
||||||
|
fa,
|
||||||
|
"C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
|
||||||
|
new String[] {
|
||||||
|
"c3po",
|
||||||
|
"aujourd'hui",
|
||||||
|
"oeuf",
|
||||||
|
"ïâöûàä",
|
||||||
|
"anticonstitutionnel",
|
||||||
|
"jav" });
|
||||||
|
|
||||||
|
// some more everything else
|
||||||
|
// here 1940-1945 stays as one term, 1940:1945 not ?
|
||||||
|
assertAnalyzesTo(
|
||||||
|
fa,
|
||||||
|
"33Bis 1940-1945 1940:1945 (---i+++)*",
|
||||||
|
new String[] { "33bis", "1940-1945", "1940", "1945", "i" });
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
public void testReusableTokenStream() throws Exception {
|
public void testReusableTokenStream() throws Exception {
|
||||||
FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT);
|
FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT);
|
||||||
// stopwords
|
// stopwords
|
||||||
|
@ -157,4 +247,28 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
|
||||||
assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable",
|
assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable",
|
||||||
"chist" });
|
"chist" });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testElision() throws Exception {
|
||||||
|
FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
assertAnalyzesTo(fa, "voir l'embrouille", new String[] { "voir", "embrouill" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prior to 3.1, this analyzer had no lowercase filter.
|
||||||
|
* stopwords were case sensitive. Preserve this for back compat.
|
||||||
|
* @deprecated Remove this test in Lucene 4.0
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
public void testBuggyStopwordsCasing() throws IOException {
|
||||||
|
FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_30);
|
||||||
|
assertAnalyzesTo(a, "Votre", new String[] { "votr" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test that stopwords are not case sensitive
|
||||||
|
*/
|
||||||
|
public void testStopwordsCasing() throws IOException {
|
||||||
|
FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_31);
|
||||||
|
assertAnalyzesTo(a, "Votre", new String[] { });
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,54 @@
|
||||||
|
package org.apache.lucene.analysis.hu;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
public class TestHungarianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
/** This test fails with NPE when the
|
||||||
|
* stopwords file is missing in classpath */
|
||||||
|
public void testResourcesAvailable() {
|
||||||
|
new HungarianAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test stopwords and stemming */
|
||||||
|
public void testBasics() throws IOException {
|
||||||
|
Analyzer a = new HungarianAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
// stemming
|
||||||
|
checkOneTermReuse(a, "babakocsi", "babakocs");
|
||||||
|
checkOneTermReuse(a, "babakocsijáért", "babakocs");
|
||||||
|
// stopword
|
||||||
|
assertAnalyzesTo(a, "által", new String[] {});
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test use of exclusion set */
|
||||||
|
public void testExclude() throws IOException {
|
||||||
|
Set<String> exclusionSet = new HashSet<String>();
|
||||||
|
exclusionSet.add("babakocsi");
|
||||||
|
Analyzer a = new HungarianAnalyzer(Version.LUCENE_CURRENT,
|
||||||
|
HungarianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
|
checkOneTermReuse(a, "babakocsi", "babakocsi");
|
||||||
|
checkOneTermReuse(a, "babakocsijáért", "babakocs");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,54 @@
|
||||||
|
package org.apache.lucene.analysis.it;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
/** This test fails with NPE when the
|
||||||
|
* stopwords file is missing in classpath */
|
||||||
|
public void testResourcesAvailable() {
|
||||||
|
new ItalianAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test stopwords and stemming */
|
||||||
|
public void testBasics() throws IOException {
|
||||||
|
Analyzer a = new ItalianAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
// stemming
|
||||||
|
checkOneTermReuse(a, "abbandonata", "abbandon");
|
||||||
|
checkOneTermReuse(a, "abbandonati", "abbandon");
|
||||||
|
// stopword
|
||||||
|
assertAnalyzesTo(a, "dallo", new String[] {});
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test use of exclusion set */
|
||||||
|
public void testExclude() throws IOException {
|
||||||
|
Set<String> exclusionSet = new HashSet<String>();
|
||||||
|
exclusionSet.add("abbandonata");
|
||||||
|
Analyzer a = new ItalianAnalyzer(Version.LUCENE_CURRENT,
|
||||||
|
ItalianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
|
checkOneTermReuse(a, "abbandonata", "abbandonata");
|
||||||
|
checkOneTermReuse(a, "abbandonati", "abbandon");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,44 @@
|
||||||
|
package org.apache.lucene.analysis.miscellaneous;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringReader;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.KeywordTokenizer;
|
||||||
|
import org.apache.lucene.analysis.PorterStemFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase {
|
||||||
|
public void testOverride() throws IOException {
|
||||||
|
// lets make booked stem to books
|
||||||
|
// the override filter will convert "booked" to "books",
|
||||||
|
// but also mark it with KeywordAttribute so Porter will not change it.
|
||||||
|
Map<String,String> dictionary = new HashMap<String,String>();
|
||||||
|
dictionary.put("booked", "books");
|
||||||
|
Tokenizer tokenizer = new KeywordTokenizer(new StringReader("booked"));
|
||||||
|
TokenStream stream = new PorterStemFilter(
|
||||||
|
new StemmerOverrideFilter(Version.LUCENE_CURRENT, tokenizer, dictionary));
|
||||||
|
assertTokenStreamContents(stream, new String[] { "books" });
|
||||||
|
}
|
||||||
|
}
|
|
@ -22,7 +22,6 @@ import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -18,10 +18,8 @@ package org.apache.lucene.analysis.ngram;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -100,9 +100,6 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
|
||||||
check("ophalend", "ophal");
|
check("ophalend", "ophal");
|
||||||
check("ophalers", "ophaler");
|
check("ophalers", "ophaler");
|
||||||
check("ophef", "ophef");
|
check("ophef", "ophef");
|
||||||
check("opheffen", "ophef"); // versus snowball 'opheff'
|
|
||||||
check("opheffende", "ophef"); // versus snowball 'opheff'
|
|
||||||
check("opheffing", "ophef"); // versus snowball 'opheff'
|
|
||||||
check("opheldering", "ophelder");
|
check("opheldering", "ophelder");
|
||||||
check("ophemelde", "ophemeld");
|
check("ophemelde", "ophemeld");
|
||||||
check("ophemelen", "ophemel");
|
check("ophemelen", "ophemel");
|
||||||
|
@ -118,6 +115,24 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
|
||||||
check("ophouden", "ophoud");
|
check("ophouden", "ophoud");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @deprecated remove this test in Lucene 4.0
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
public void testOldBuggyStemmer() throws Exception {
|
||||||
|
Analyzer a = new DutchAnalyzer(Version.LUCENE_30);
|
||||||
|
checkOneTermReuse(a, "opheffen", "ophef"); // versus snowball 'opheff'
|
||||||
|
checkOneTermReuse(a, "opheffende", "ophef"); // versus snowball 'opheff'
|
||||||
|
checkOneTermReuse(a, "opheffing", "ophef"); // versus snowball 'opheff'
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSnowballCorrectness() throws Exception {
|
||||||
|
Analyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
checkOneTermReuse(a, "opheffen", "opheff");
|
||||||
|
checkOneTermReuse(a, "opheffende", "opheff");
|
||||||
|
checkOneTermReuse(a, "opheffing", "opheff");
|
||||||
|
}
|
||||||
|
|
||||||
public void testReusableTokenStream() throws Exception {
|
public void testReusableTokenStream() throws Exception {
|
||||||
Analyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT);
|
Analyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT);
|
||||||
checkOneTermReuse(a, "lichaamsziek", "lichaamsziek");
|
checkOneTermReuse(a, "lichaamsziek", "lichaamsziek");
|
||||||
|
@ -161,6 +176,25 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
|
||||||
checkOneTermReuse(a, "lichamelijk", "somethingentirelydifferent");
|
checkOneTermReuse(a, "lichamelijk", "somethingentirelydifferent");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prior to 3.1, this analyzer had no lowercase filter.
|
||||||
|
* stopwords were case sensitive. Preserve this for back compat.
|
||||||
|
* @deprecated Remove this test in Lucene 4.0
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
public void testBuggyStopwordsCasing() throws IOException {
|
||||||
|
DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_30);
|
||||||
|
assertAnalyzesTo(a, "Zelf", new String[] { "zelf" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test that stopwords are not case sensitive
|
||||||
|
*/
|
||||||
|
public void testStopwordsCasing() throws IOException {
|
||||||
|
DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_31);
|
||||||
|
assertAnalyzesTo(a, "Zelf", new String[] { });
|
||||||
|
}
|
||||||
|
|
||||||
private void check(final String input, final String expected) throws Exception {
|
private void check(final String input, final String expected) throws Exception {
|
||||||
checkOneTerm(new DutchAnalyzer(Version.LUCENE_CURRENT), input, expected);
|
checkOneTerm(new DutchAnalyzer(Version.LUCENE_CURRENT), input, expected);
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,54 @@
|
||||||
|
package org.apache.lucene.analysis.no;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
public class TestNorwegianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
/** This test fails with NPE when the
|
||||||
|
* stopwords file is missing in classpath */
|
||||||
|
public void testResourcesAvailable() {
|
||||||
|
new NorwegianAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test stopwords and stemming */
|
||||||
|
public void testBasics() throws IOException {
|
||||||
|
Analyzer a = new NorwegianAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
// stemming
|
||||||
|
checkOneTermReuse(a, "havnedistriktene", "havnedistrikt");
|
||||||
|
checkOneTermReuse(a, "havnedistrikter", "havnedistrikt");
|
||||||
|
// stopword
|
||||||
|
assertAnalyzesTo(a, "det", new String[] {});
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test use of exclusion set */
|
||||||
|
public void testExclude() throws IOException {
|
||||||
|
Set<String> exclusionSet = new HashSet<String>();
|
||||||
|
exclusionSet.add("havnedistriktene");
|
||||||
|
Analyzer a = new NorwegianAnalyzer(Version.LUCENE_CURRENT,
|
||||||
|
NorwegianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
|
checkOneTermReuse(a, "havnedistriktene", "havnedistriktene");
|
||||||
|
checkOneTermReuse(a, "havnedistrikter", "havnedistrikt");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,54 @@
|
||||||
|
package org.apache.lucene.analysis.pt;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
public class TestPortugueseAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
/** This test fails with NPE when the
|
||||||
|
* stopwords file is missing in classpath */
|
||||||
|
public void testResourcesAvailable() {
|
||||||
|
new PortugueseAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test stopwords and stemming */
|
||||||
|
public void testBasics() throws IOException {
|
||||||
|
Analyzer a = new PortugueseAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
// stemming
|
||||||
|
checkOneTermReuse(a, "quilométricas", "quilométr");
|
||||||
|
checkOneTermReuse(a, "quilométricos", "quilométr");
|
||||||
|
// stopword
|
||||||
|
assertAnalyzesTo(a, "não", new String[] {});
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test use of exclusion set */
|
||||||
|
public void testExclude() throws IOException {
|
||||||
|
Set<String> exclusionSet = new HashSet<String>();
|
||||||
|
exclusionSet.add("quilométricas");
|
||||||
|
Analyzer a = new PortugueseAnalyzer(Version.LUCENE_CURRENT,
|
||||||
|
PortugueseAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
|
checkOneTermReuse(a, "quilométricas", "quilométricas");
|
||||||
|
checkOneTermReuse(a, "quilométricos", "quilométr");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,54 @@
|
||||||
|
package org.apache.lucene.analysis.ro;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
public class TestRomanianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
/** This test fails with NPE when the
|
||||||
|
* stopwords file is missing in classpath */
|
||||||
|
public void testResourcesAvailable() {
|
||||||
|
new RomanianAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test stopwords and stemming */
|
||||||
|
public void testBasics() throws IOException {
|
||||||
|
Analyzer a = new RomanianAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
// stemming
|
||||||
|
checkOneTermReuse(a, "absenţa", "absenţ");
|
||||||
|
checkOneTermReuse(a, "absenţi", "absenţ");
|
||||||
|
// stopword
|
||||||
|
assertAnalyzesTo(a, "îl", new String[] {});
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test use of exclusion set */
|
||||||
|
public void testExclude() throws IOException {
|
||||||
|
Set<String> exclusionSet = new HashSet<String>();
|
||||||
|
exclusionSet.add("absenţa");
|
||||||
|
Analyzer a = new RomanianAnalyzer(Version.LUCENE_CURRENT,
|
||||||
|
RomanianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
|
checkOneTermReuse(a, "absenţa", "absenţa");
|
||||||
|
checkOneTermReuse(a, "absenţi", "absenţ");
|
||||||
|
}
|
||||||
|
}
|
|
@ -50,9 +50,14 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase
|
||||||
dataDir = new File(System.getProperty("dataDir", "./bin"));
|
dataDir = new File(System.getProperty("dataDir", "./bin"));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testUnicode() throws IOException
|
/**
|
||||||
|
* @deprecated remove this test and its datafiles in Lucene 4.0
|
||||||
|
* the Snowball version has its own data tests.
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
public void testUnicode30() throws IOException
|
||||||
{
|
{
|
||||||
RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_CURRENT);
|
RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_30);
|
||||||
inWords =
|
inWords =
|
||||||
new InputStreamReader(
|
new InputStreamReader(
|
||||||
new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUTF8.txt")),
|
new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUTF8.txt")),
|
||||||
|
@ -110,12 +115,22 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** @deprecated remove this test in Lucene 4.0: stopwords changed */
|
||||||
|
@Deprecated
|
||||||
|
public void testReusableTokenStream30() throws Exception {
|
||||||
|
Analyzer a = new RussianAnalyzer(Version.LUCENE_30);
|
||||||
|
assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
|
||||||
|
new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" });
|
||||||
|
assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
|
||||||
|
new String[] { "знан", "хран", "тайн" });
|
||||||
|
}
|
||||||
|
|
||||||
public void testReusableTokenStream() throws Exception {
|
public void testReusableTokenStream() throws Exception {
|
||||||
Analyzer a = new RussianAnalyzer(Version.LUCENE_CURRENT);
|
Analyzer a = new RussianAnalyzer(Version.LUCENE_CURRENT);
|
||||||
assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
|
assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
|
||||||
new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" });
|
new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" });
|
||||||
assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
|
assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
|
||||||
new String[] { "знан", "хран", "тайн" });
|
new String[] { "знан", "эт", "хран", "тайн" });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -25,7 +25,9 @@ import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Testcase for {@link RussianLetterTokenizer}
|
* Testcase for {@link RussianLetterTokenizer}
|
||||||
|
* @deprecated Remove this test class in Lucene 4.0
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public class TestRussianLetterTokenizer extends BaseTokenStreamTestCase {
|
public class TestRussianLetterTokenizer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
public void testRussianLetterTokenizer() throws IOException {
|
public void testRussianLetterTokenizer() throws IOException {
|
||||||
|
|
|
@ -24,6 +24,10 @@ import java.io.InputStreamReader;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @deprecated Remove this test class (and its datafiles!) in Lucene 4.0
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
public class TestRussianStem extends LuceneTestCase
|
public class TestRussianStem extends LuceneTestCase
|
||||||
{
|
{
|
||||||
private ArrayList words = new ArrayList();
|
private ArrayList words = new ArrayList();
|
||||||
|
|
|
@ -22,11 +22,8 @@ import java.io.StringReader;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Arrays;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.*;
|
import org.apache.lucene.analysis.*;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
|
||||||
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
|
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
|
||||||
import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter;
|
import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter;
|
||||||
import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
|
import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
|
||||||
|
|
|
@ -0,0 +1,54 @@
|
||||||
|
package org.apache.lucene.analysis.sv;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
public class TestSwedishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
/** This test fails with NPE when the
|
||||||
|
* stopwords file is missing in classpath */
|
||||||
|
public void testResourcesAvailable() {
|
||||||
|
new SwedishAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test stopwords and stemming */
|
||||||
|
public void testBasics() throws IOException {
|
||||||
|
Analyzer a = new SwedishAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
// stemming
|
||||||
|
checkOneTermReuse(a, "jaktkarlarne", "jaktkarl");
|
||||||
|
checkOneTermReuse(a, "jaktkarlens", "jaktkarl");
|
||||||
|
// stopword
|
||||||
|
assertAnalyzesTo(a, "och", new String[] {});
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test use of exclusion set */
|
||||||
|
public void testExclude() throws IOException {
|
||||||
|
Set<String> exclusionSet = new HashSet<String>();
|
||||||
|
exclusionSet.add("jaktkarlarne");
|
||||||
|
Analyzer a = new SwedishAnalyzer(Version.LUCENE_CURRENT,
|
||||||
|
SwedishAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
|
checkOneTermReuse(a, "jaktkarlarne", "jaktkarlarne");
|
||||||
|
checkOneTermReuse(a, "jaktkarlens", "jaktkarl");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,54 @@
|
||||||
|
package org.apache.lucene.analysis.tr;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
public class TestTurkishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
/** This test fails with NPE when the
|
||||||
|
* stopwords file is missing in classpath */
|
||||||
|
public void testResourcesAvailable() {
|
||||||
|
new TurkishAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test stopwords and stemming */
|
||||||
|
public void testBasics() throws IOException {
|
||||||
|
Analyzer a = new TurkishAnalyzer(Version.LUCENE_CURRENT);
|
||||||
|
// stemming
|
||||||
|
checkOneTermReuse(a, "ağacı", "ağaç");
|
||||||
|
checkOneTermReuse(a, "ağaç", "ağaç");
|
||||||
|
// stopword
|
||||||
|
assertAnalyzesTo(a, "dolayı", new String[] {});
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test use of exclusion set */
|
||||||
|
public void testExclude() throws IOException {
|
||||||
|
Set<String> exclusionSet = new HashSet<String>();
|
||||||
|
exclusionSet.add("ağacı");
|
||||||
|
Analyzer a = new TurkishAnalyzer(Version.LUCENE_CURRENT,
|
||||||
|
TurkishAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
|
checkOneTermReuse(a, "ağacı", "ağacı");
|
||||||
|
checkOneTermReuse(a, "ağaç", "ağaç");
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue