LUCENE-3086: add ElisionFilter to ItalianAnalyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1102120 13f79535-47bb-0310-9956-ffa450edef68
Author: Robert Muir
Date:   2011-05-11 22:43:54 +00:00
parent b3bb2aa0ac
commit a871b29ed6
4 changed files with 41 additions and 3 deletions


@@ -50,6 +50,11 @@ Bug Fixes
======================= Lucene 3.x (not yet released) =======================
Changes in runtime behavior
* LUCENE-3086: ItalianAnalyzer now uses ElisionFilter with a set of Italian
  contractions by default. (Robert Muir)
Bug Fixes
* LUCENE-3045: fixed QueryNodeImpl.containsTag(String key) that was
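The runtime effect of this entry is spelled out by the tests added at the end of this commit: with Version.LUCENE_32 the leading contraction is stripped before stemming, while Version.LUCENE_31 keeps the old behavior. A minimal sketch of the two configurations (construction only; the expected tokens are copied from those tests, everything else is illustrative):

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.it.ItalianAnalyzer;
    import org.apache.lucene.util.Version;

    // 3.2 defaults: ElisionFilter drops "dell", then the stemmer reduces "italia" to "ital".
    Analyzer current = new ItalianAnalyzer(Version.LUCENE_32);  // "dell'Italia" -> [ital]

    // 3.1 compatibility: no elision, so the contraction survives into the stemmer.
    Analyzer legacy = new ItalianAnalyzer(Version.LUCENE_31);   // "dell'Italia" -> [dell'ital]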


@@ -31,8 +31,6 @@ import org.apache.lucene.util.Version;
/**
* Removes elisions from a {@link TokenStream}. For example, "l'avion" (the plane) will be
* tokenized as "avion" (plane).
* <p>
* Note that {@link StandardTokenizer} sees " ' " as a space, and cuts it out.
*
* @see <a href="http://fr.wikipedia.org/wiki/%C3%89lision">Elision in Wikipedia</a>
*/
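This hunk only trims the ElisionFilter javadoc, but the filter itself is what the Italian analyzer now wires in. A rough standalone sketch using the three-argument constructor that appears later in this commit; the article set and input text here are made up for illustration and are not the analyzer's defaults:

    import java.io.StringReader;
    import java.util.Arrays;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.fr.ElisionFilter;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.analysis.util.CharArraySet;  // assumed package; may differ by branch
    import org.apache.lucene.util.Version;

    Version matchVersion = Version.LUCENE_32;
    // A deliberately tiny article set, for illustration only.
    CharArraySet articles = new CharArraySet(matchVersion, Arrays.asList("l", "d"), true);
    TokenStream stream = new StandardTokenizer(matchVersion, new StringReader("l'avion"));
    // Same three-argument constructor ItalianAnalyzer uses below:
    // drops the leading article plus the apostrophe, emitting "avion".
    stream = new ElisionFilter(matchVersion, stream, articles);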


@@ -19,11 +19,13 @@ package org.apache.lucene.analysis.it;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.fr.ElisionFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@@ -38,6 +40,14 @@ import org.tartarus.snowball.ext.ItalianStemmer;
/**
* {@link Analyzer} for Italian.
* <p>
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating ItalianAnalyzer:
* <ul>
* <li> As of 3.2, ElisionFilter with a set of Italian
* contractions is used by default.
* </ul>
*/
public final class ItalianAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet;
@@ -45,6 +55,13 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
/** File containing default Italian stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "italian_stop.txt";
private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList(
"c", "l", "all", "dall", "dell", "nell", "sull", "coll", "pell",
"gl", "agl", "dagl", "degl", "negl", "sugl", "un", "m", "t", "s", "v", "d"
), true));
/**
* Returns an unmodifiable instance of the default stop words set.
* @return default stop words set.
@@ -112,7 +129,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
* @return A
* {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* {@link StandardFilter}, {@link ElisionFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* provided and {@link SnowballFilter}.
*/
@@ -121,6 +138,9 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
if (matchVersion.onOrAfter(Version.LUCENE_32)) {
result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
}
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
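The hunk above is cut off right after the stem-exclusion check. Based on the javadoc earlier in this file, the remaining steps mark excluded terms as keywords and finish with the Snowball Italian stemmer; a rough reconstruction of the whole method under that assumption (not the literal committed text) looks like:

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new StandardTokenizer(matchVersion, reader);
      TokenStream result = new StandardFilter(matchVersion, source);
      if (matchVersion.onOrAfter(Version.LUCENE_32)) {
        // New in this commit: strip Italian contractions ("dell'", "l'", ...) up front,
        // before lowercasing, stop filtering, and stemming.
        result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
      }
      result = new LowerCaseFilter(matchVersion, result);
      result = new StopFilter(matchVersion, result, stopwords);
      if (!stemExclusionSet.isEmpty()) {
        // Terms in the exclusion set are marked as keywords so the stemmer leaves them alone.
        result = new KeywordMarkerFilter(result, stemExclusionSet);
      }
      result = new SnowballFilter(result, new ItalianStemmer());
      return new TokenStreamComponents(source, result);
    }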


@@ -23,6 +23,7 @@ import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;
public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
@@ -55,4 +56,18 @@ public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
checkRandomData(random, new ItalianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
}
/** test that the elisionfilter is working */
public void testContractions() throws IOException {
Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(a, "dell'Italia", new String[] { "ital" });
assertAnalyzesTo(a, "l'Italiano", new String[] { "ital" });
}
/** test that we don't enable this before 3.2*/
public void testContractionsBackwards() throws IOException {
Analyzer a = new ItalianAnalyzer(Version.LUCENE_31);
assertAnalyzesTo(a, "dell'Italia", new String[] { "dell'ital" });
assertAnalyzesTo(a, "l'Italiano", new String[] { "l'ital" });
}
}