diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 46a60c87712..2e6021bf090 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -50,6 +50,11 @@ Bug Fixes ======================= Lucene 3.x (not yet released) ======================= +Changes in runtime behavior + + * LUCENE-3086: ItalianAnalyzer now uses ElisionFilter with a set of Italian + contractions by default. (Robert Muir) + Bug Fixes * LUCENE-3045: fixed QueryNodeImpl.containsTag(String key) that was diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java index b43a5c3b0dc..507a114336a 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java @@ -31,8 +31,6 @@ import org.apache.lucene.util.Version; /** * Removes elisions from a {@link TokenStream}. For example, "l'avion" (the plane) will be * tokenized as "avion" (plane). - *

- * Note that {@link StandardTokenizer} sees " ' " as a space, and cuts it out. * * @see Elision in Wikipedia */ diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java index adb51f29d44..bd8cc47a40f 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java @@ -19,11 +19,13 @@ package org.apache.lucene.analysis.it; import java.io.IOException; import java.io.Reader; +import java.util.Arrays; import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.fr.ElisionFilter; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; @@ -38,6 +40,14 @@ import org.tartarus.snowball.ext.ItalianStemmer; /** * {@link Analyzer} for Italian. + *

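+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating ItalianAnalyzer:
+ * <ul>
+ *   <li> As of 3.2, ElisionFilter with a set of Italian
+ *        contractions is used by default.
+ * </ul>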
*/ public final class ItalianAnalyzer extends StopwordAnalyzerBase { private final Set stemExclusionSet; @@ -45,6 +55,13 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase { /** File containing default Italian stopwords. */ public final static String DEFAULT_STOPWORD_FILE = "italian_stop.txt"; + private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet( + new CharArraySet(Version.LUCENE_CURRENT, + Arrays.asList( + "c", "l", "all", "dall", "dell", "nell", "sull", "coll", "pell", + "gl", "agl", "dagl", "degl", "negl", "sugl", "un", "m", "t", "s", "v", "d" + ), true)); + /** * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. @@ -112,7 +129,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase { * @return A * {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with - * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter} + * {@link StandardFilter}, {@link ElisionFilter}, {@link LowerCaseFilter}, {@link StopFilter} * , {@link KeywordMarkerFilter} if a stem exclusion set is * provided and {@link SnowballFilter}. 
*/ @@ -121,6 +138,9 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase { Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); + if (matchVersion.onOrAfter(Version.LUCENE_32)) { + result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES); + } result = new LowerCaseFilter(matchVersion, result); result = new StopFilter(matchVersion, result, stopwords); if(!stemExclusionSet.isEmpty()) diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java index ae4bf2f2d24..83d7a863b35 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java @@ -23,6 +23,7 @@ import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.util.Version; public class TestItalianAnalyzer extends BaseTokenStreamTestCase { /** This test fails with NPE when the @@ -55,4 +56,18 @@ public class TestItalianAnalyzer extends BaseTokenStreamTestCase { public void testRandomStrings() throws Exception { checkRandomData(random, new ItalianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER); } + + /** test that the elisionfilter is working */ + public void testContractions() throws IOException { + Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT); + assertAnalyzesTo(a, "dell'Italia", new String[] { "ital" }); + assertAnalyzesTo(a, "l'Italiano", new String[] { "ital" }); + } + + /** test that we don't enable this before 3.2*/ + public void testContractionsBackwards() throws IOException { + Analyzer a = new ItalianAnalyzer(Version.LUCENE_31); + assertAnalyzesTo(a, "dell'Italia", new String[] { "dell'ital" }); + assertAnalyzesTo(a, 
"l'Italiano", new String[] { "l'ital" }); + } }