mirror of https://github.com/apache/lucene.git
- LUCENE-906: Elision filter for French.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@551744 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d9b6aa9c0b
commit
71f2c1da8b
|
@ -16,6 +16,9 @@ Bug fixes
|
|||
|
||||
New features
|
||||
|
||||
1. LUCENE-906: Elision filter for French.
|
||||
(Mathieu Lecarme via Otis Gospodnetic)
|
||||
|
||||
Optimizations
|
||||
|
||||
1. LUCENE-937: CachingTokenFilter now uses an iterator to access the
|
||||
|
|
|
@ -0,0 +1,98 @@
|
|||
package org.apache.lucene.analysis.fr;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Set;
|
||||
import java.util.HashSet;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
|
||||
/**
|
||||
* Removes elisions from a token stream. For example, "l'avion" (the plane) will be
|
||||
* tokenized as "avion" (plane).
|
||||
*
|
||||
* @author Mathieu Lecarme<mlecarme@openwide.fr>
|
||||
 * @see <a href="http://fr.wikipedia.org/wiki/%C3%89lision">Élision</a>
|
||||
*
|
||||
* Note that StandardTokenizer sees "’" as a space, and cuts it out.
|
||||
*/
|
||||
public class ElisionFilter extends TokenFilter {
|
||||
private Set articles = null;
|
||||
|
||||
private static String apostrophes = "'’";
|
||||
|
||||
public void setArticles(Set articles) {
|
||||
this.articles = new HashSet();
|
||||
Iterator iter = articles.iterator();
|
||||
while (iter.hasNext()) {
|
||||
this.articles.add(((String) iter.next()).toLowerCase());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs an elision filter with standard stop words
|
||||
*/
|
||||
protected ElisionFilter(TokenStream input) {
|
||||
super(input);
|
||||
this.articles = new HashSet(Arrays.asList(new String[] { "l", "m", "t",
|
||||
"qu", "n", "s", "j" }));
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs an elision filter with a Set of stop words
|
||||
*/
|
||||
public ElisionFilter(TokenStream input, Set articles) {
|
||||
super(input);
|
||||
setArticles(articles);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs an elision filter with an array of stop words
|
||||
*/
|
||||
public ElisionFilter(TokenStream input, String[] articles) {
|
||||
super(input);
|
||||
setArticles(new HashSet(Arrays.asList(articles)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the next input Token whith termText() without elisioned start
|
||||
*/
|
||||
public Token next() throws IOException {
|
||||
Token t = input.next();
|
||||
if (t == null)
|
||||
return null;
|
||||
String text = t.termText();
|
||||
System.out.println(text);
|
||||
int minPoz = -1;
|
||||
int poz;
|
||||
for (int i = 0; i < apostrophes.length(); i++) {
|
||||
poz = text.indexOf(apostrophes.charAt(i));
|
||||
if (poz != -1)
|
||||
minPoz = (minPoz == -1) ? poz : Math.min(poz, minPoz);
|
||||
}
|
||||
if (minPoz != -1
|
||||
&& articles.contains(text.substring(0, minPoz).toLowerCase()))
|
||||
text = text.substring(minPoz + 1);
|
||||
return new Token(text, t.startOffset(), t.endOffset(), t.type());
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,70 @@
|
|||
package org.apache.lucene.analysis.fr;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
|
||||
/**
|
||||
* @author Mathieu Lecarme<mlecarme@openwide.fr>
|
||||
*
|
||||
*/
|
||||
public class TestElision extends TestCase {
|
||||
|
||||
public void testElision() {
|
||||
String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
|
||||
Tokenizer tokenizer = new StandardTokenizer(new StringReader(test));
|
||||
Set articles = new HashSet();
|
||||
articles.add("l");
|
||||
articles.add("M");
|
||||
TokenFilter filter = new ElisionFilter(tokenizer, articles);
|
||||
List tas = filtre(filter);
|
||||
assertEquals("embrouille", tas.get(4));
|
||||
assertEquals("O'brian", tas.get(6));
|
||||
assertEquals("enfin", tas.get(7));
|
||||
}
|
||||
|
||||
private List filtre(TokenFilter filter) {
|
||||
List tas = new ArrayList();
|
||||
try {
|
||||
boolean encore = true;
|
||||
Token token;
|
||||
while (encore) {
|
||||
token = filter.next();
|
||||
encore = token != null;
|
||||
if (token != null)
|
||||
tas.add(token.termText());
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return tas;
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue