From dac1b58277c41d66197bbe2a11cc14a4a406e99c Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Wed, 8 Feb 2012 12:07:52 +0000 Subject: [PATCH] SOLR-3097, SOLR-3105: add fieldtypes for different languages to the example git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1241878 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/contrib/CHANGES.txt | 8 + .../lucene/analysis/ca/CatalanAnalyzer.java | 19 + .../lucene/analysis/de/GermanAnalyzer.java | 11 +- .../de/GermanNormalizationFilter.java | 112 +++++ .../lucene/analysis/es/SpanishAnalyzer.java | 15 +- .../lucene/analysis/fr/FrenchAnalyzer.java | 9 +- .../gl/GalicianMinimalStemFilter.java | 58 +++ .../analysis/gl/GalicianMinimalStemmer.java | 38 ++ .../lucene/analysis/hi/HindiAnalyzer.java | 17 +- .../lucene/analysis/in/IndicTokenizer.java | 3 + .../lucene/analysis/it/ItalianAnalyzer.java | 9 +- .../analysis/pt/PortugueseAnalyzer.java | 15 +- .../analysis/pt/PortugueseMinimalStemmer.java | 2 +- .../lucene/analysis/th/ThaiAnalyzer.java | 71 ++- .../apache/lucene/analysis/th/stopwords.txt | 119 +++++ .../analysis/ca/TestCatalanAnalyzer.java | 7 + .../de/TestGermanNormalizationFilter.java | 68 +++ .../analysis/fr/TestFrenchAnalyzer.java | 24 +- .../gl/TestGalicianMinimalStemFilter.java | 55 +++ .../analysis/it/TestItalianAnalyzer.java | 8 +- .../analysis/pt/TestPortugueseAnalyzer.java | 6 +- .../lucene/analysis/th/TestThaiAnalyzer.java | 26 +- solr/CHANGES.txt | 6 + solr/build.xml | 94 ++++ .../solr/analysis/ElisionFilterFactory.java | 6 +- .../GalicianMinimalStemFilterFactory.java | 39 ++ .../GermanNormalizationFilterFactory.java | 39 ++ .../analysis/TestElisionFilterFactory.java | 18 + .../TestGalicianMinimalStemFilterFactory.java | 36 ++ .../TestGermanNormalizationFilterFactory.java | 36 ++ .../solr/conf/lang/contractions_ca.txt | 8 + .../solr/conf/lang/contractions_fr.txt | 9 + .../solr/conf/lang/contractions_it.txt | 23 + solr/example/solr/conf/lang/stemdict_nl.txt | 6 + 
solr/example/solr/conf/lang/stoptags_ja.txt | 420 ++++++++++++++++++ solr/example/solr/conf/lang/stopwords_ar.txt | 123 +++++ solr/example/solr/conf/lang/stopwords_bg.txt | 193 ++++++++ solr/example/solr/conf/lang/stopwords_ca.txt | 220 +++++++++ solr/example/solr/conf/lang/stopwords_cz.txt | 172 +++++++ solr/example/solr/conf/lang/stopwords_da.txt | 108 +++++ solr/example/solr/conf/lang/stopwords_de.txt | 292 ++++++++++++ solr/example/solr/conf/lang/stopwords_el.txt | 76 ++++ .../solr/conf/{ => lang}/stopwords_en.txt | 0 solr/example/solr/conf/lang/stopwords_es.txt | 354 +++++++++++++++ solr/example/solr/conf/lang/stopwords_eu.txt | 99 +++++ solr/example/solr/conf/lang/stopwords_fa.txt | 311 +++++++++++++ solr/example/solr/conf/lang/stopwords_fi.txt | 95 ++++ solr/example/solr/conf/lang/stopwords_fr.txt | 183 ++++++++ solr/example/solr/conf/lang/stopwords_gl.txt | 161 +++++++ solr/example/solr/conf/lang/stopwords_hi.txt | 231 ++++++++++ solr/example/solr/conf/lang/stopwords_hu.txt | 209 +++++++++ solr/example/solr/conf/lang/stopwords_hy.txt | 46 ++ solr/example/solr/conf/lang/stopwords_id.txt | 359 +++++++++++++++ solr/example/solr/conf/lang/stopwords_it.txt | 301 +++++++++++++ solr/example/solr/conf/lang/stopwords_ja.txt | 122 +++++ solr/example/solr/conf/lang/stopwords_lv.txt | 172 +++++++ solr/example/solr/conf/lang/stopwords_nl.txt | 117 +++++ solr/example/solr/conf/lang/stopwords_no.txt | 192 ++++++++ solr/example/solr/conf/lang/stopwords_pt.txt | 251 +++++++++++ solr/example/solr/conf/lang/stopwords_ro.txt | 233 ++++++++++ solr/example/solr/conf/lang/stopwords_ru.txt | 241 ++++++++++ solr/example/solr/conf/lang/stopwords_sv.txt | 131 ++++++ solr/example/solr/conf/lang/stopwords_th.txt | 119 +++++ solr/example/solr/conf/lang/stopwords_tr.txt | 212 +++++++++ solr/example/solr/conf/schema.xml | 329 +++++++++++++- 65 files changed, 7039 insertions(+), 53 deletions(-) create mode 100644 
modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanNormalizationFilter.java create mode 100644 modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemFilter.java create mode 100644 modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemmer.java create mode 100644 modules/analysis/common/src/resources/org/apache/lucene/analysis/th/stopwords.txt create mode 100644 modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanNormalizationFilter.java create mode 100644 modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java create mode 100644 solr/core/src/java/org/apache/solr/analysis/GalicianMinimalStemFilterFactory.java create mode 100644 solr/core/src/java/org/apache/solr/analysis/GermanNormalizationFilterFactory.java create mode 100644 solr/core/src/test/org/apache/solr/analysis/TestGalicianMinimalStemFilterFactory.java create mode 100644 solr/core/src/test/org/apache/solr/analysis/TestGermanNormalizationFilterFactory.java create mode 100644 solr/example/solr/conf/lang/contractions_ca.txt create mode 100644 solr/example/solr/conf/lang/contractions_fr.txt create mode 100644 solr/example/solr/conf/lang/contractions_it.txt create mode 100644 solr/example/solr/conf/lang/stemdict_nl.txt create mode 100644 solr/example/solr/conf/lang/stoptags_ja.txt create mode 100644 solr/example/solr/conf/lang/stopwords_ar.txt create mode 100644 solr/example/solr/conf/lang/stopwords_bg.txt create mode 100644 solr/example/solr/conf/lang/stopwords_ca.txt create mode 100644 solr/example/solr/conf/lang/stopwords_cz.txt create mode 100644 solr/example/solr/conf/lang/stopwords_da.txt create mode 100644 solr/example/solr/conf/lang/stopwords_de.txt create mode 100644 solr/example/solr/conf/lang/stopwords_el.txt rename solr/example/solr/conf/{ => lang}/stopwords_en.txt (100%) create mode 100644 solr/example/solr/conf/lang/stopwords_es.txt create mode 100644 
solr/example/solr/conf/lang/stopwords_eu.txt create mode 100644 solr/example/solr/conf/lang/stopwords_fa.txt create mode 100644 solr/example/solr/conf/lang/stopwords_fi.txt create mode 100644 solr/example/solr/conf/lang/stopwords_fr.txt create mode 100644 solr/example/solr/conf/lang/stopwords_gl.txt create mode 100644 solr/example/solr/conf/lang/stopwords_hi.txt create mode 100644 solr/example/solr/conf/lang/stopwords_hu.txt create mode 100644 solr/example/solr/conf/lang/stopwords_hy.txt create mode 100644 solr/example/solr/conf/lang/stopwords_id.txt create mode 100644 solr/example/solr/conf/lang/stopwords_it.txt create mode 100644 solr/example/solr/conf/lang/stopwords_ja.txt create mode 100644 solr/example/solr/conf/lang/stopwords_lv.txt create mode 100644 solr/example/solr/conf/lang/stopwords_nl.txt create mode 100644 solr/example/solr/conf/lang/stopwords_no.txt create mode 100644 solr/example/solr/conf/lang/stopwords_pt.txt create mode 100644 solr/example/solr/conf/lang/stopwords_ro.txt create mode 100644 solr/example/solr/conf/lang/stopwords_ru.txt create mode 100644 solr/example/solr/conf/lang/stopwords_sv.txt create mode 100644 solr/example/solr/conf/lang/stopwords_th.txt create mode 100644 solr/example/solr/conf/lang/stopwords_tr.txt diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 066d0a60bce..0ec6dca13f9 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -169,6 +169,14 @@ Changes in runtime behavior * LUCENE-3626: PKIndexSplitter and MultiPassIndexSplitter now work per segment. (Uwe Schindler) + + * SOLR-3105: When passed LUCENE_36 or greater as version, GermanAnalyzer, + SpanishAnalyzer, FrenchAnalyzer, ItalianAnalyzer, and PortugueseAnalyzer + use a lighter stemming approach, CatalanAnalyzer uses ElisionFilter + with a set of contractions, HindiAnalyzer uses StandardTokenizer, and + ThaiAnalyzer uses thai stopwords. 
Add GermanNormalizationFilter which applies + the Snowball German2 algorithm to ae/oe/ue and case-folds ß. Add + GalicianMinimalStemFilter for plural removal only. (Robert Muir) Optimizations diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java index eaaed17030b..4f95cad55f7 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java @@ -19,11 +19,13 @@ package org.apache.lucene.analysis.ca; import java.io.IOException; import java.io.Reader; +import java.util.Arrays; import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.fr.ElisionFilter; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; @@ -37,6 +39,14 @@ import org.tartarus.snowball.ext.CatalanStemmer; /** * {@link Analyzer} for Catalan. + *

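+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating CatalanAnalyzer:
+ * <ul>
+ *   <li> As of 3.6, ElisionFilter with a set of Catalan
+ *        contractions is used by default.
+ * </ul>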
*/ public final class CatalanAnalyzer extends StopwordAnalyzerBase { private final Set stemExclusionSet; @@ -44,6 +54,12 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase { /** File containing default Catalan stopwords. */ public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; + private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet( + new CharArraySet(Version.LUCENE_CURRENT, + Arrays.asList( + "d", "l", "m", "n", "s", "t" + ), true)); + /** * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. @@ -120,6 +136,9 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase { Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); + if (matchVersion.onOrAfter(Version.LUCENE_36)) { + result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES); + } result = new LowerCaseFilter(matchVersion, result); result = new StopFilter(matchVersion, result, stopwords); if(!stemExclusionSet.isEmpty()) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java index 9abde8c249c..3d2dd4d9ef5 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java @@ -54,6 +54,7 @@ import org.tartarus.snowball.ext.German2Stemmer; *

You must specify the required {@link Version} * compatibility when creating GermanAnalyzer: *