diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt
index 46a60c87712..2e6021bf090 100644
--- a/lucene/contrib/CHANGES.txt
+++ b/lucene/contrib/CHANGES.txt
@@ -50,6 +50,11 @@ Bug Fixes
======================= Lucene 3.x (not yet released) =======================
+Changes in runtime behavior
+
+ * LUCENE-3086: ItalianAnalyzer now uses ElisionFilter with a set of Italian
+ contractions by default. (Robert Muir)
+
Bug Fixes
* LUCENE-3045: fixed QueryNodeImpl.containsTag(String key) that was
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
index b43a5c3b0dc..507a114336a 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
@@ -31,8 +31,6 @@ import org.apache.lucene.util.Version;
/**
* Removes elisions from a {@link TokenStream}. For example, "l'avion" (the plane) will be
* tokenized as "avion" (plane).
- *
- * Note that {@link StandardTokenizer} sees " ' " as a space, and cuts it out.
*
* @see Elision in Wikipedia
*/
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
index adb51f29d44..bd8cc47a40f 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
@@ -19,11 +19,13 @@ package org.apache.lucene.analysis.it;
import java.io.IOException;
import java.io.Reader;
+import java.util.Arrays;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.fr.ElisionFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@@ -38,6 +40,14 @@ import org.tartarus.snowball.ext.ItalianStemmer;
/**
* {@link Analyzer} for Italian.
+ *
+ *
+ *
You must specify the required {@link Version}
+ * compatibility when creating ItalianAnalyzer:
+ *
+ * - As of 3.2, ElisionFilter with a set of Italian
+ * contractions is used by default.
+ *
*/
public final class ItalianAnalyzer extends StopwordAnalyzerBase {
private final Set> stemExclusionSet;
@@ -45,6 +55,13 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
/** File containing default Italian stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "italian_stop.txt";
+ private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
+ new CharArraySet(Version.LUCENE_CURRENT,
+ Arrays.asList(
+ "c", "l", "all", "dall", "dell", "nell", "sull", "coll", "pell",
+ "gl", "agl", "dagl", "degl", "negl", "sugl", "un", "m", "t", "s", "v", "d"
+ ), true));
+
/**
* Returns an unmodifiable instance of the default stop words set.
* @return default stop words set.
@@ -112,7 +129,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
* @return A
* {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
- * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
+ * {@link StandardFilter}, {@link ElisionFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* provided and {@link SnowballFilter}.
*/
@@ -121,6 +138,9 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
+ if (matchVersion.onOrAfter(Version.LUCENE_32)) {
+ result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
+ }
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java
index ae4bf2f2d24..83d7a863b35 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java
@@ -23,6 +23,7 @@ import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
@@ -55,4 +56,18 @@ public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
checkRandomData(random, new ItalianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
}
+
+ /** test that the elisionfilter is working */
+ public void testContractions() throws IOException {
+ Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT);
+ assertAnalyzesTo(a, "dell'Italia", new String[] { "ital" });
+ assertAnalyzesTo(a, "l'Italiano", new String[] { "ital" });
+ }
+
+ /** test that we don't enable this before 3.2*/
+ public void testContractionsBackwards() throws IOException {
+ Analyzer a = new ItalianAnalyzer(Version.LUCENE_31);
+ assertAnalyzesTo(a, "dell'Italia", new String[] { "dell'ital" });
+ assertAnalyzesTo(a, "l'Italiano", new String[] { "l'ital" });
+ }
}