mirror of https://github.com/apache/lucene.git
LUCENE-3086: add ElisionFilter to ItalianAnalyzer
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1102120 13f79535-47bb-0310-9956-ffa450edef68
parent b3bb2aa0ac
commit a871b29ed6
@@ -50,6 +50,11 @@ Bug Fixes

======================= Lucene 3.x (not yet released) =======================

Changes in runtime behavior

* LUCENE-3086: ItalianAnalyzer now uses ElisionFilter with a set of Italian
  contractions by default.  (Robert Muir)

Bug Fixes

* LUCENE-3045: fixed QueryNodeImpl.containsTag(String key) that was
@@ -31,8 +31,6 @@ import org.apache.lucene.util.Version;

/**
 * Removes elisions from a {@link TokenStream}. For example, "l'avion" (the plane) will be
 * tokenized as "avion" (plane).
 * <p>
 * Note that {@link StandardTokenizer} sees " ' " as a space, and cuts it out.
 *
 * @see <a href="http://fr.wikipedia.org/wiki/%C3%89lision">Elision in Wikipedia</a>
 */
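For context, a rough standalone sketch (not part of this commit) of how ElisionFilter is wired onto a tokenizer, using the trunk API of the time. The two-entry article set, the sample text, and the class name are made up for illustration, and CharArraySet is assumed to live under org.apache.lucene.analysis.util on trunk; ItalianAnalyzer itself uses the DEFAULT_ARTICLES set shown further down.

// Sketch only: strip elided articles such as "l'" before further analysis.
// The article set here is a hypothetical example, not DEFAULT_ARTICLES.
import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.fr.ElisionFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class ElisionFilterSketch {
  public static void main(String[] args) throws Exception {
    CharArraySet articles =
        new CharArraySet(Version.LUCENE_32, Arrays.asList("l", "d"), true);
    Tokenizer source =
        new StandardTokenizer(Version.LUCENE_32, new StringReader("l'avion"));
    TokenStream stream = new ElisionFilter(Version.LUCENE_32, source, articles);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(term.toString()); // "avion": the leading "l'" is removed
    }
    stream.end();
    stream.close();
  }
}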
@@ -19,11 +19,13 @@ package org.apache.lucene.analysis.it;

import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.fr.ElisionFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@@ -38,6 +40,14 @@ import org.tartarus.snowball.ext.ItalianStemmer;

/**
 * {@link Analyzer} for Italian.
 * <p>
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating ItalianAnalyzer:
 * <ul>
 *   <li> As of 3.2, ElisionFilter with a set of Italian
 *        contractions is used by default.
 * </ul>
 */
public final class ItalianAnalyzer extends StopwordAnalyzerBase {
  private final Set<?> stemExclusionSet;
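The version gate documented in that javadoc is what callers opt into. A minimal sketch (not from the commit) of the two constructor choices; the IndexWriterConfig usage is an assumed, illustrative surrounding API and the class name is made up:

import org.apache.lucene.analysis.it.ItalianAnalyzer;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.util.Version;

public class ItalianAnalyzerVersionSketch {
  public static void main(String[] args) {
    // 3.2 semantics: contractions such as "dell'Italia" are reduced before stemming.
    ItalianAnalyzer current = new ItalianAnalyzer(Version.LUCENE_32);
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_32, current);

    // Pre-3.2 semantics: pass an older constant to keep the previous behavior
    // for indexes built before this change.
    ItalianAnalyzer legacy = new ItalianAnalyzer(Version.LUCENE_31);
  }
}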
@@ -45,6 +55,13 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {

  /** File containing default Italian stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "italian_stop.txt";

  private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
      new CharArraySet(Version.LUCENE_CURRENT,
          Arrays.asList(
          "c", "l", "all", "dall", "dell", "nell", "sull", "coll", "pell",
          "gl", "agl", "dagl", "degl", "negl", "sugl", "un", "m", "t", "s", "v", "d"
          ), true));

  /**
   * Returns an unmodifiable instance of the default stop words set.
   * @return default stop words set.
@@ -112,7 +129,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
   * @return A
   *         {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
-  *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
+  *         {@link StandardFilter}, {@link ElisionFilter}, {@link LowerCaseFilter}, {@link StopFilter}
   *         , {@link KeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link SnowballFilter}.
   */
@@ -121,6 +138,9 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    if (matchVersion.onOrAfter(Version.LUCENE_32)) {
      result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
    }
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if(!stemExclusionSet.isEmpty())
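To make the ordering explicit (elision runs right after StandardFilter, before lowercasing, stop removal, and stemming), here is a rough hand-wired equivalent of the chain above. This is a sketch, not the committed method: the stopword and article sets are placeholders the caller would supply, the class and method names are made up, and the optional KeywordMarkerFilter step is only noted in a comment.

import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.fr.ElisionFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.ItalianStemmer;

public class ItalianChainSketch {
  public static TokenStream buildChain(Reader reader, Set<?> stopwords, Set<?> articles) {
    Version v = Version.LUCENE_32;
    Tokenizer source = new StandardTokenizer(v, reader);
    TokenStream result = new StandardFilter(v, source);
    // The analyzer applies this step only when matchVersion is 3.2 or later.
    result = new ElisionFilter(v, result, articles);
    result = new LowerCaseFilter(v, result);
    result = new StopFilter(v, result, stopwords);
    // ItalianAnalyzer also inserts a KeywordMarkerFilter here when a stem
    // exclusion set is provided, so those terms skip stemming.
    return new SnowballFilter(result, new ItalianStemmer());
  }
}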
@@ -23,6 +23,7 @@ import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;

public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
  /** This test fails with NPE when the
@@ -55,4 +56,18 @@ public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
  public void testRandomStrings() throws Exception {
    checkRandomData(random, new ItalianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
  }

  /** test that the elisionfilter is working */
  public void testContractions() throws IOException {
    Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT);
    assertAnalyzesTo(a, "dell'Italia", new String[] { "ital" });
    assertAnalyzesTo(a, "l'Italiano", new String[] { "ital" });
  }

  /** test that we don't enable this before 3.2 */
  public void testContractionsBackwards() throws IOException {
    Analyzer a = new ItalianAnalyzer(Version.LUCENE_31);
    assertAnalyzesTo(a, "dell'Italia", new String[] { "dell'ital" });
    assertAnalyzesTo(a, "l'Italiano", new String[] { "l'ital" });
  }
}
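As a usage note, the assertions in these tests correspond to what an application sees when consuming the analyzer directly. A sketch (not part of the commit), with an arbitrary field name and a made-up class name:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.it.ItalianAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class ContractionTokensSketch {
  public static void main(String[] args) throws Exception {
    ItalianAnalyzer analyzer = new ItalianAnalyzer(Version.LUCENE_32);
    TokenStream stream = analyzer.tokenStream("body", new StringReader("dell'Italia"));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(term.toString()); // "ital" with 3.2 semantics
    }
    stream.end();
    stream.close();
  }
}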