LUCENE-5482: Improve default TurkishAnalyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1573066 13f79535-47bb-0310-9956-ffa450edef68
2014-02-28 20:41:32 +00:00 · 2014-02-28 20:41:32 +00:00 · 4193bce372
parent 2356994e0c
commit 4193bce372
9 changed files with 205 additions and 0 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -81,6 +81,9 @@ New Features
  additions and updates (to change weight or payload of an existing
  suggestion).  (Mike McCandless)

+* LUCENE-5482: Improve default TurkishAnalyzer by adding apostrophe
+  handling suitable for Turkish.  (Ahmet Arslan via Robert Muir)
+
 API Changes

 * LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/ApostropheFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/ApostropheFilter.java
@ -0,0 +1,59 @@
+package org.apache.lucene.analysis.tr;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+import java.io.IOException;
+
+/**
+ * Strips all characters after an apostrophe (including the apostrophe itself).
+ * <p>
+ * In Turkish, apostrophe is used to separate suffixes from proper names
+ * (continent, sea, river, lake, mountain, upland, proper names related to
+ * religion and mythology). This filter intended to be used before stem filters.
+ * For more information, see <a href="http://www.ipcsit.com/vol57/015-ICNI2012-M021.pdf">
+ * Role of Apostrophes in Turkish Information Retrieval</a>
+ * </p>
+ */
+public final class ApostropheFilter extends TokenFilter {
+
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+  public ApostropheFilter(TokenStream in) {
+    super(in);
+  }
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (!input.incrementToken())
+      return false;
+
+    final char[] buffer = termAtt.buffer();
+    final int length = termAtt.length();
+
+    for (int i = 0; i < length; i++)
+      if (buffer[i] == '\'' || buffer[i] == '\u2019') {
+        termAtt.setLength(i);
+        return true;
+      }
+    return true;
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/ApostropheFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/ApostropheFilterFactory.java
@ -0,0 +1,49 @@
+package org.apache.lucene.analysis.tr;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+import java.util.Map;
+
+/**
+ * Factory for {@link ApostropheFilter}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_tr_lower_apostrophes" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.ApostropheFilterFactory"/&gt;
+ *     &lt;filter class="solr.TurkishLowerCaseFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */
+public class ApostropheFilterFactory extends TokenFilterFactory {
+
+  public ApostropheFilterFactory(Map<String, String> args) {
+    super(args);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameter(s): " + args);
+    }
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new ApostropheFilter(input);
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java
@ -122,6 +122,8 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
  protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer(matchVersion);
    TokenStream result = new StandardFilter(matchVersion, source);
+    if(matchVersion.onOrAfter(Version.LUCENE_48))
+      result = new ApostropheFilter(result);
    result = new TurkishLowerCaseFilter(result);
    result = new StopFilter(matchVersion, result, stopwords);
    if(!stemExclusionSet.isEmpty())
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@ -13,6 +13,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.

+org.apache.lucene.analysis.tr.ApostropheFilterFactory
 org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory
 org.apache.lucene.analysis.ar.ArabicStemFilterFactory
 org.apache.lucene.analysis.bg.BulgarianStemFilterFactory
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestApostropheFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestApostropheFilter.java
@ -0,0 +1,32 @@
+package org.apache.lucene.analysis.tr;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+
+
+public class TestApostropheFilter extends BaseTokenStreamTestCase {
+
+  public void testApostropheFilter() throws Exception {
+    TokenStream stream = whitespaceMockTokenizer("Türkiye'de 2003'te Van Gölü'nü gördüm");
+    stream = new TurkishLowerCaseFilter(stream);
+    stream = new ApostropheFilter(stream);
+    assertTokenStreamContents(stream, new String[]{"türkiye", "2003", "van", "gölü", "gördüm"});
+  }
+}
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestApostropheFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestApostropheFilterFactory.java
@ -0,0 +1,55 @@
+package org.apache.lucene.analysis.tr;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+
+import java.io.Reader;
+import java.io.StringReader;
+
+/**
+ * Simple tests to ensure the apostrophe filter factory is working.
+ */
+public class TestApostropheFilterFactory extends BaseTokenStreamFactoryTestCase {
+  /**
+   * Ensure the filter actually removes characters after an apostrophe.
+   */
+  public void testApostrophes() throws Exception {
+    Reader reader = new StringReader("Türkiye'de 2003'te Van Gölü'nü gördüm");
+    TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    ((Tokenizer) stream).setReader(reader);
+    stream = tokenFilterFactory("Apostrophe").create(stream);
+    assertTokenStreamContents(stream, new String[]{"Türkiye", "2003", "Van", "Gölü", "gördüm"});
+  }
+
+  /**
+   * Test that bogus arguments result in exception
+   */
+  public void testBogusArguments() throws Exception {
+    try {
+      tokenFilterFactory("Apostrophe", "bogusArg", "bogusValue");
+      fail();
+    } catch (IllegalArgumentException expected) {
+      assertTrue(expected.getMessage().contains("Unknown parameter(s):"));
+    }
+  }
+}
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java
@ -38,6 +38,9 @@ public class TestTurkishAnalyzer extends BaseTokenStreamTestCase {
    checkOneTerm(a, "ağaç", "ağaç");
    // stopword
    assertAnalyzesTo(a, "dolayı", new String[] {});
+    // apostrophes
+    checkOneTerm(a, "Kıbrıs'ta", "kıbrıs");
+    assertAnalyzesTo(a, "Van Gölü'ne", new String[]{"van", "göl"});
  }
  
  /** test use of exclusion set */
--- a/solr/example/solr/collection1/conf/schema.xml
+++ b/solr/example/solr/collection1/conf/schema.xml
@ -1124,6 +1124,7 @@
    <fieldType name="text_tr" class="solr.TextField" positionIncrementGap="100">
      <analyzer> 
        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.ApostropheFilterFactory"/>
        <filter class="solr.TurkishLowerCaseFilterFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_tr.txt" />
        <filter class="solr.SnowballPorterFilterFactory" language="Turkish"/>