diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 066d0a60bce..0ec6dca13f9 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -169,6 +169,14 @@ Changes in runtime behavior * LUCENE-3626: PKIndexSplitter and MultiPassIndexSplitter now work per segment. (Uwe Schindler) + + * SOLR-3105: When passed LUCENE_36 or greater as version, GermanAnalyzer, + SpanishAnalyzer, FrenchAnalyzer, ItalianAnalyzer, and PortugueseAnalyzer + use a lighter stemming approach, CatalanAnalyzer uses ElisionFilter + with a set of contractions, HindiAnalyzer uses StandardTokenizer, and + ThaiAnalyzer uses Thai stopwords. Add GermanNormalizationFilter which applies + the Snowball German2 algorithm to ae/oe/ue and case-folds ß. Add + GalicianMinimalStemFilter for plural removal only. (Robert Muir) Optimizations diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java index eaaed17030b..4f95cad55f7 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java @@ -19,11 +19,13 @@ package org.apache.lucene.analysis.ca; import java.io.IOException; import java.io.Reader; +import java.util.Arrays; import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.fr.ElisionFilter; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; @@ -37,6 +39,14 @@ import org.tartarus.snowball.ext.CatalanStemmer; /** * {@link Analyzer} for Catalan. + *

+ * + *

You must specify the required {@link Version} + * compatibility when creating CatalanAnalyzer: + *

*/ public final class CatalanAnalyzer extends StopwordAnalyzerBase { private final Set stemExclusionSet; @@ -44,6 +54,12 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase { /** File containing default Catalan stopwords. */ public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; + private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet( + new CharArraySet(Version.LUCENE_CURRENT, + Arrays.asList( + "d", "l", "m", "n", "s", "t" + ), true)); + /** * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. @@ -120,6 +136,9 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase { Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); + if (matchVersion.onOrAfter(Version.LUCENE_36)) { + result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES); + } result = new LowerCaseFilter(matchVersion, result); result = new StopFilter(matchVersion, result, stopwords); if(!stemExclusionSet.isEmpty()) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java index 9abde8c249c..3d2dd4d9ef5 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java @@ -54,6 +54,7 @@ import org.tartarus.snowball.ext.German2Stemmer; *

You must specify the required {@link Version} * compatibility when creating GermanAnalyzer: *