Added katakana stem filter (LUCENE-3901)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1304719 13f79535-47bb-0310-9956-ffa450edef68
2025-03-01 05:49:33 +00:00 · 2012-03-24 06:38:53 +00:00 · 2012-03-24 06:38:53 +00:00 · 63f1c48b7d
commit 63f1c48b7d
parent e0141c7350
7 changed files with 248 additions and 15 deletions
--- a/lucene/contrib/CHANGES.txt
+++ b/lucene/contrib/CHANGES.txt
@ -172,6 +172,10 @@ New Features
 * LUCENE-3767: Kuromoji tokenizer/analyzer produces both compound words 
   and the segmentation of that compound in Mode.SEARCH. (Robert Muir, Mike McCandless via Christian Moen)

+ * LUCENE-3901: Added katakana stem filter to normalize common spelling variants
+   with/without trailing long vowel marks. The filter is used in both KuromojiAnalyzer
+   and the "text_ja" field type in schema.xml. (Christian Moen)
+
 * LUCENE-3685: Add ToChildBlockJoinQuery and renamed previous
   BlockJoinQuery to ToParentBlockJoinQuery, so that you can now do
   joins in both parent to child and child to parent directions.
--- a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
+++ b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
@ -92,6 +92,7 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase {
    stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags);
    stream = new CJKWidthFilter(stream);
    stream = new StopFilter(matchVersion, stream, stopwords);
+    stream = new KuromojiKatakanaStemFilter(stream);
    stream = new LowerCaseFilter(matchVersion, stream);
    return new TokenStreamComponents(tokenizer, stream);
  }
--- a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiKatakanaStemFilter.java
+++ b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiKatakanaStemFilter.java
@ -0,0 +1,98 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+import java.io.IOException;
+
+/**
+ * A {@link TokenFilter} that normalizes common katakana spelling variations
+ * ending in a long sound character by removing this character (U+30FC).  Only
+ * katakana words longer than a minimum length are stemmed (default is four).
+ * <p>
+ * Note that only full-width katakana characters are supported.  Please use a
+ * {@link org.apache.lucene.analysis.cjk.CJKWidthFilter} to convert half-width
+ * katakana to full-width before using this filter.
+ * </p>
+ * <p>
+ * In order to prevent terms from being stemmed, use an instance of
+ * {@link org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter}
+ * or a custom {@link TokenFilter} that sets the {@link KeywordAttribute}
+ * before this {@link TokenStream}.
+ * </p>
+ */
+
+public final class KuromojiKatakanaStemFilter extends TokenFilter {
+  public final static int DEFAULT_MINIMUM_LENGTH = 4;
+  private final static char HIRAGANA_KATAKANA_PROLONGED_SOUND_MARK = '\u30fc';
+
+  private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+  private final int minimumKatakanaLength;
+
+  public KuromojiKatakanaStemFilter(TokenStream input, int minimumLength) {
+    super(input);
+    this.minimumKatakanaLength = minimumLength;
+  }
+
+  public KuromojiKatakanaStemFilter(TokenStream input) {
+    this(input, DEFAULT_MINIMUM_LENGTH);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      if (!keywordAttr.isKeyword()) {
+        termAttr.setLength(stem(termAttr.buffer(), termAttr.length()));
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  private int stem(char[] term, int length) {
+    if (length < minimumKatakanaLength) {
+      return length;
+    }
+
+    if (! isKatakana(term, length)) {
+      return length;
+    }
+
+    if (term[length - 1] == HIRAGANA_KATAKANA_PROLONGED_SOUND_MARK) {
+      return length - 1;
+    }
+
+    return length;
+  }
+
+  private boolean isKatakana(char[] term, int length) {
+    for (int i = 0; i < length; i++) {
+      // NOTE: Test only identifies full-width characters -- half-widths are supported
+      if (Character.UnicodeBlock.of(term[i]) != Character.UnicodeBlock.KATAKANA) {
+        return false;
+      }
+    }
+    return true;
+  }
+}
--- a/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java
+++ b/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java
@ -24,6 +24,9 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;

+/**
+ * Test Kuromoji Japanese morphological analyzer
+ */
 public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase {
  /** This test fails with NPE when the 
   * stopwords file is missing in classpath */
@ -54,27 +57,26 @@ public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase {
                                            KuromojiAnalyzer.getDefaultStopSet(),
                                            KuromojiAnalyzer.getDefaultStopTags());

-    /*
-    //TokenStream ts = a.tokenStream("foo", new StringReader("妹の咲子です。俺と年子で、今受験生です。"));
-    TokenStream ts = a.tokenStream("foo", new StringReader("&#x250cdf66<!--\"<!--#<!--;?><!--#<!--#><!---->?>-->;"));
-    ts.reset();
-    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
-    while(ts.incrementToken()) {
-      System.out.println("  " + termAtt.toString());
-    }
-    System.out.println("DONE PARSE\n\n");
-    */
-
    // Senior software engineer:
    assertAnalyzesToPositions(a, "シニアソフトウェアエンジニア",
                              new String[] { "シニア",
-                                             "シニアソフトウェアエンジニア",
+                                             "シニアソフトウェアエンジニア", // zero pos inc
                                             "ソフトウェア",
                                             "エンジニア" },
                              new int[] { 1, 0, 1, 1},
                              new int[] { 1, 3, 1, 1}
                              );

+    // Senior project manager: also tests katakana spelling variation stemming
+    assertAnalyzesToPositions(a, "シニアプロジェクトマネージャー",
+                              new String[] { "シニア",
+                                              "シニアプロジェクトマネージャ", // trailing ー removed by stemming, zero pos inc
+                                              "プロジェクト",
+                                              "マネージャ"}, // trailing ー removed by stemming
+                              new int[]{1, 0, 1, 1},
+                              new int[]{1, 3, 1, 1}
+                              );
+
    // Kansai International Airport:
    assertAnalyzesToPositions(a, "関西国際空港",
                              new String[] { "関西",
--- a/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiKatakanaStemFilter.java
+++ b/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiKatakanaStemFilter.java
@ -0,0 +1,71 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * Tests for {@link org.apache.lucene.analysis.kuromoji.KuromojiKatakanaStemFilter}
+ */
+public class TestKuromojiKatakanaStemFilter extends BaseTokenStreamTestCase {
+  private Analyzer analyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      // Use a MockTokenizer here since this filter doesn't really depend on Kuromoji
+      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+      return new TokenStreamComponents(source, new KuromojiKatakanaStemFilter(source));
+    }
+  };
+  
+  /**
+   * Test a few common katakana spelling variations.
+   * <p>
+   * English translations are as follows:
+   * <ul>
+   *   <li>copy</li>
+   *   <li>coffee</li>
+   *   <li>taxi</li>
+   *   <li>party</li>
+   *   <li>party (without long sound)</li>
+   *   <li>center</li>
+   * </ul>
+   * Note that we remove a long sound in the case of "coffee" that is required.
+   * </p>
+   */
+  public void testStemVariants() throws IOException {
+    assertAnalyzesTo(analyzer, "コピー コーヒー タクシー パーティー パーティ センター",
+      new String[] { "コピー",  "コーヒ", "タクシ", "パーティ", "パーティ", "センタ" },
+      new int[] { 0, 4,  9, 14, 20, 25 },
+      new int[] { 3, 8, 13, 19, 24, 29 });
+  }
+
+  public void testUnsupportedHalfWidthVariants() throws IOException {
+    // The below result is expected since only full-width katakana is supported
+    assertAnalyzesTo(analyzer, "ﾀｸｼｰ", new String[] { "ﾀｸｼｰ" });
+  }
+  
+  public void testRandomData() throws IOException {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
+}
--- a/solr/core/src/java/org/apache/solr/analysis/KuromojiKatakanaStemFilterFactory.java
+++ b/solr/core/src/java/org/apache/solr/analysis/KuromojiKatakanaStemFilterFactory.java
@ -0,0 +1,55 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.kuromoji.KuromojiKatakanaStemFilter;
+import org.apache.solr.common.SolrException;
+
+import java.util.Map;
+
+/**
+ * Factory for {@link KuromojiKatakanaStemFilterFactory}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_ja" class="solr.TextField"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.KuromojiTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.KuromojiKatakanaStemFilterFactory"
+ *             minimumLength="4"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;
+ * </pre>
+ */
+public class KuromojiKatakanaStemFilterFactory extends BaseTokenFilterFactory {
+  private static final String MINIMUM_LENGTH_PARAM = "minimumLength";
+  private int minimumLength;
+  
+  @Override
+  public void init(Map<String, String> args) {
+    super.init(args);
+    minimumLength = getInt(MINIMUM_LENGTH_PARAM, KuromojiKatakanaStemFilter.DEFAULT_MINIMUM_LENGTH);
+    if (minimumLength < 2) {
+      throw new SolrException(SolrException.ErrorCode.UNKNOWN,
+                              "Illegal " + MINIMUM_LENGTH_PARAM + " " + minimumLength + " (must be 2 or greater)");
+    }
+  }
+
+  public TokenStream create(TokenStream input) {
+    return new KuromojiKatakanaStemFilter(input, minimumLength);
+  }
+}
--- a/solr/example/solr/conf/schema.xml
+++ b/solr/example/solr/conf/schema.xml
@ -504,13 +504,13 @@
    
    <!-- CJK bigram (see text_ja for a Japanese configuration using morphological analysis) -->
    <fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100">
-      <analyzer> 
+      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <!-- normalize width before bigram, as e.g. half-width dakuten combine  -->
        <filter class="solr.CJKWidthFilterFactory"/>
        <!-- for any non-CJK -->
        <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.CJKBigramFilterFactory"/>     
+        <filter class="solr.CJKBigramFilterFactory"/>
      </analyzer>
    </fieldType>

@ -720,7 +720,9 @@
        <filter class="solr.CJKWidthFilterFactory"/>
        <!-- Removes common tokens typically not useful for search, but have a negative effect on ranking -->
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" />
-        <!-- Lower-case romaji characters -->
+        <!-- Normalizes common katakana spelling variations by removing any last long sound character (U+30FC) -->
+        <filter class="solr.KuromojiKatakanaStemFilterFactory" minimumLength="4"/>
+        <!-- Lower-cases romaji characters -->
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>