diff --git a/CHANGES.txt b/CHANGES.txt
index 0db3e1139bb..0383f801a58 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -16,6 +16,9 @@ Bug fixes
 
 New features
 
+ 1. LUCENE-906: Elision filter for French.
+    (Mathieu Lecarme via Otis Gospodnetic)
+
 Optimizations
 
  1. LUCENE-937: CachingTokenFilter now uses an iterator to access the
diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
new file mode 100644
index 00000000000..6dcee91d84f
--- /dev/null
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
@@ -0,0 +1,98 @@
+package org.apache.lucene.analysis.fr;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.Arrays;
+import java.util.Iterator;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.TokenFilter;
+
+/**
+ * Removes elided articles from a token stream. For example, "l'avion" (the plane)
+ * will be tokenized as "avion" (plane).
+ * <p>
+ * Note that StandardTokenizer sees "’" as a space, and cuts it out.
+ *
+ * @author Mathieu Lecarme
+ * @see <a href="http://fr.wikipedia.org/wiki/%C3%89lision">Élision (Wikipedia)</a>
+ */
+public class ElisionFilter extends TokenFilter {
+  private Set articles = null;
+
+  private static String apostrophes = "'’";
+
+  public void setArticles(Set articles) {
+    this.articles = new HashSet();
+    Iterator iter = articles.iterator();
+    while (iter.hasNext()) {
+      this.articles.add(((String) iter.next()).toLowerCase());
+    }
+  }
+
+  /**
+   * Constructs an elision filter with the default set of French articles.
+   */
+  protected ElisionFilter(TokenStream input) {
+    super(input);
+    this.articles = new HashSet(Arrays.asList(new String[] { "l", "m", "t",
+        "qu", "n", "s", "j" }));
+  }
+
+  /**
+   * Constructs an elision filter with a Set of articles.
+   */
+  public ElisionFilter(TokenStream input, Set articles) {
+    super(input);
+    setArticles(articles);
+  }
+
+  /**
+   * Constructs an elision filter with an array of articles.
+   */
+  public ElisionFilter(TokenStream input, String[] articles) {
+    super(input);
+    setArticles(new HashSet(Arrays.asList(articles)));
+  }
+
+  /**
+   * Returns the next input Token, with any elided article stripped from the
+   * start of its termText().
+   */
+  public Token next() throws IOException {
+    Token t = input.next();
+    if (t == null)
+      return null;
+    String text = t.termText();
+    // Find the first apostrophe (straight or typographic) in the token.
+    int minPoz = -1;
+    int poz;
+    for (int i = 0; i < apostrophes.length(); i++) {
+      poz = text.indexOf(apostrophes.charAt(i));
+      if (poz != -1)
+        minPoz = (minPoz == -1) ? poz : Math.min(poz, minPoz);
+    }
+    // If the prefix before the apostrophe is a known article, drop it.
+    if (minPoz != -1
+        && articles.contains(text.substring(0, minPoz).toLowerCase()))
+      text = text.substring(minPoz + 1);
+    return new Token(text, t.startOffset(), t.endOffset(), t.type());
+  }
+
+}
diff --git a/contrib/analyzers/src/test/org/apache/lucene/analysis/fr/TestElision.java b/contrib/analyzers/src/test/org/apache/lucene/analysis/fr/TestElision.java
new file mode 100644
index 00000000000..bc9b255980a
--- /dev/null
+++ b/contrib/analyzers/src/test/org/apache/lucene/analysis/fr/TestElision.java
@@ -0,0 +1,70 @@
+package org.apache.lucene.analysis.fr;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+/**
+ * @author Mathieu Lecarme
+ */
+public class TestElision extends TestCase {
+
+  public void testElision() {
+    String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
+    Tokenizer tokenizer = new StandardTokenizer(new StringReader(test));
+    Set articles = new HashSet();
+    articles.add("l");
+    articles.add("M");
+    TokenFilter filter = new ElisionFilter(tokenizer, articles);
+    List tas = filtre(filter);
+    assertEquals("embrouille", tas.get(4));
+    assertEquals("O'brian", tas.get(6));
+    assertEquals("enfin", tas.get(7));
+  }
+
+  /**
+   * Collects the term text of every token produced by the filter.
+   */
+  private List filtre(TokenFilter filter) {
+    List tas = new ArrayList();
+    try {
+      boolean encore = true;
+      Token token;
+      while (encore) {
+        token = filter.next();
+        encore = token != null;
+        if (token != null)
+          tas.add(token.termText());
+      }
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+    return tas;
+  }
+
+}
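
For context, below is a minimal usage sketch (not part of the patch) showing how the new ElisionFilter might be wired behind StandardTokenizer using the old pull-style TokenStream API that this code targets. The ElisionExample class name, the sample text, and the article set passed to the constructor are illustrative assumptions, not anything defined by the patch.

import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.fr.ElisionFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

// Illustrative sketch only; ElisionExample is not part of this patch.
public class ElisionExample {
  public static void main(String[] args) throws Exception {
    // Caller-supplied articles; note "d" is not in the filter's default list.
    Set articles = new HashSet(Arrays.asList(new String[] { "l", "d" }));
    TokenStream stream = new ElisionFilter(
        new StandardTokenizer(new StringReader("l'avion d'essai")), articles);
    // Old-style TokenStream API: pull tokens until next() returns null.
    for (Token t = stream.next(); t != null; t = stream.next()) {
      System.out.println(t.termText()); // prints "avion" then "essai"
    }
  }
}

Because setArticles() lowercases the supplied articles and the filter lowercases the prefix before the apostrophe when looking it up, matching is case-insensitive, which is why the test above can add "M" and still strip "M'enfin" down to "enfin".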