diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index ec35d87d0b1..66d5573f917 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -172,6 +172,10 @@ New Features * LUCENE-3767: Kuromoji tokenizer/analyzer produces both compound words and the segmentation of that compound in Mode.SEARCH. (Robert Muir, Mike McCandless via Christian Moen) + * LUCENE-3901: Added katakana stem filter to normalize common spelling variants + with/without trailing long vowel marks. The filter is used in both KuromojiAnalyzer + and the "text_ja" field type in schema.xml. (Christian Moen) + * LUCENE-3685: Add ToChildBlockJoinQuery and renamed previous BlockJoinQuery to ToParentBlockJoinQuery, so that you can now do joins in both parent to child and child to parent directions. diff --git a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java index e32ba592fb4..3d758f8486d 100644 --- a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java +++ b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java @@ -92,6 +92,7 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase { stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags); stream = new CJKWidthFilter(stream); stream = new StopFilter(matchVersion, stream, stopwords); + stream = new KuromojiKatakanaStemFilter(stream); stream = new LowerCaseFilter(matchVersion, stream); return new TokenStreamComponents(tokenizer, stream); } diff --git a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiKatakanaStemFilter.java b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiKatakanaStemFilter.java new file mode 100644 index 00000000000..37a60d79882 --- /dev/null +++ 
b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiKatakanaStemFilter.java @@ -0,0 +1,98 @@ +package org.apache.lucene.analysis.kuromoji; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +import java.io.IOException; + +/** + * A {@link TokenFilter} that normalizes common katakana spelling variations + * ending in a long sound character by removing this character (U+30FC). Only + * katakana words of at least a minimum length are stemmed (default is four). + *

+ * Note that only full-width katakana characters are supported. Please use a + * {@link org.apache.lucene.analysis.cjk.CJKWidthFilter} to convert half-width + * katakana to full-width before using this filter. + *

+ *

+ * In order to prevent terms from being stemmed, use an instance of + * {@link org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter} + * or a custom {@link TokenFilter} that sets the {@link KeywordAttribute} + * before this {@link TokenStream}. + *

+ */ + +public final class KuromojiKatakanaStemFilter extends TokenFilter { + public final static int DEFAULT_MINIMUM_LENGTH = 4; + private final static char HIRAGANA_KATAKANA_PROLONGED_SOUND_MARK = '\u30fc'; + + private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + private final int minimumKatakanaLength; + + public KuromojiKatakanaStemFilter(TokenStream input, int minimumLength) { + super(input); + this.minimumKatakanaLength = minimumLength; /* NOTE(review): minimumLength is not validated here; a value below 1 would let an empty term reach term[length - 1] in stem() -- confirm callers always pass a positive value */ + } + + public KuromojiKatakanaStemFilter(TokenStream input) { + this(input, DEFAULT_MINIMUM_LENGTH); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { /* tokens flagged as keywords are passed through unstemmed */ + termAttr.setLength(stem(termAttr.buffer(), termAttr.length())); + } + return true; + } else { + return false; + } + } + + private int stem(char[] term, int length) { /* returns the new term length: length - 1 when an all-katakana term of at least minimumKatakanaLength chars ends in U+30FC, otherwise length unchanged */ + if (length < minimumKatakanaLength) { + return length; + } + + if (! 
isKatakana(term, length)) { + return length; + } + + if (term[length - 1] == HIRAGANA_KATAKANA_PROLONGED_SOUND_MARK) { + return length - 1; + } + + return length; + } + + private boolean isKatakana(char[] term, int length) { + for (int i = 0; i < length; i++) { + // NOTE: Test only identifies full-width characters -- half-widths are not supported + if (Character.UnicodeBlock.of(term[i]) != Character.UnicodeBlock.KATAKANA) { + return false; + } + } + return true; + } +} diff --git a/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java b/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java index f98b4e163d9..d39cf875bf8 100644 --- a/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java +++ b/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java @@ -24,6 +24,9 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode; +/** + * Test Kuromoji Japanese morphological analyzer + */ public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase { /** This test fails with NPE when the * stopwords file is missing in classpath */ @@ -54,27 +57,26 @@ public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase { KuromojiAnalyzer.getDefaultStopSet(), KuromojiAnalyzer.getDefaultStopTags()); - /* - //TokenStream ts = a.tokenStream("foo", new StringReader("妹の咲子です。俺と年子で、今受験生です。")); - TokenStream ts = a.tokenStream("foo", new StringReader("�?>-->;")); - ts.reset(); - CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); - while(ts.incrementToken()) { - System.out.println(" " + termAtt.toString()); - } - System.out.println("DONE PARSE\n\n"); - */ - // Senior software engineer: assertAnalyzesToPositions(a, "シニアソフトウェアエンジニア", new String[] { "シニア", - "シニアソフトウェアエンジニア", 
"シニアソフトウェアエンジニア", // zero pos inc "ソフトウェア", "エンジニア" }, new int[] { 1, 0, 1, 1}, new int[] { 1, 3, 1, 1} ); + // Senior project manager: also tests katakana spelling variation stemming + assertAnalyzesToPositions(a, "シニアプロジェクトマネージャー", + new String[] { "シニア", + "シニアプロジェクトマネージャ", // trailing ー removed by stemming, zero pos inc + "プロジェクト", + "マネージャ"}, // trailing ー removed by stemming + new int[]{1, 0, 1, 1}, + new int[]{1, 3, 1, 1} + ); + // Kansai International Airport: assertAnalyzesToPositions(a, "関西国際空港", new String[] { "関西", diff --git a/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiKatakanaStemFilter.java b/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiKatakanaStemFilter.java new file mode 100644 index 00000000000..33ab248d7c6 --- /dev/null +++ b/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiKatakanaStemFilter.java @@ -0,0 +1,71 @@ +package org.apache.lucene.analysis.kuromoji; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Tokenizer; + +import java.io.IOException; +import java.io.Reader; + +/** + * Tests for {@link org.apache.lucene.analysis.kuromoji.KuromojiKatakanaStemFilter} + */ +public class TestKuromojiKatakanaStemFilter extends BaseTokenStreamTestCase { + private Analyzer analyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + // Use a MockTokenizer here since this filter doesn't really depend on Kuromoji + Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(source, new KuromojiKatakanaStemFilter(source)); + } + }; + + /** + * Test a few common katakana spelling variations. + *

+ * English translations are as follows: copy, coffee, taxi, party, party (spelled without the trailing long sound mark) and center. + *

+ * Note that in the case of "coffee" the filter removes a trailing long sound that is actually required. + *

+ */ + public void testStemVariants() throws IOException { + assertAnalyzesTo(analyzer, "コピー コーヒー タクシー パーティー パーティ センター", + new String[] { "コピー", "コーヒ", "タクシ", "パーティ", "パーティ", "センタ" }, + new int[] { 0, 4, 9, 14, 20, 25 }, + new int[] { 3, 8, 13, 19, 24, 29 }); + } + + public void testUnsupportedHalfWidthVariants() throws IOException { + // The below result is expected since only full-width katakana is supported. NOTE(review): both literals are presumably meant to be half-width katakana -- verify they were not normalized to full-width, which testStemVariants shows would be stemmed + assertAnalyzesTo(analyzer, "タクシー", new String[] { "タクシー" }); + } + + public void testRandomData() throws IOException { + checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER); + } +} diff --git a/solr/core/src/java/org/apache/solr/analysis/KuromojiKatakanaStemFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/KuromojiKatakanaStemFilterFactory.java new file mode 100644 index 00000000000..f1748160cdd --- /dev/null +++ b/solr/core/src/java/org/apache/solr/analysis/KuromojiKatakanaStemFilterFactory.java @@ -0,0 +1,55 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.kuromoji.KuromojiKatakanaStemFilter; +import org.apache.solr.common.SolrException; + +import java.util.Map; + +/** + * Factory for {@link KuromojiKatakanaStemFilter}. + *
+ * <fieldType name="text_ja" class="solr.TextField">
+ *   <analyzer>
+ *     <tokenizer class="solr.KuromojiTokenizerFactory"/>
+ *     <filter class="solr.KuromojiKatakanaStemFilterFactory"
+ *             minimumLength="4"/>
+ *   </analyzer>
+ * </fieldType>
+ * 
+ */ +public class KuromojiKatakanaStemFilterFactory extends BaseTokenFilterFactory { + private static final String MINIMUM_LENGTH_PARAM = "minimumLength"; + private int minimumLength; + + @Override + public void init(Map args) { /* reads "minimumLength" via getInt(), passing KuromojiKatakanaStemFilter.DEFAULT_MINIMUM_LENGTH as the default, and rejects values below 2 with a SolrException */ + super.init(args); + minimumLength = getInt(MINIMUM_LENGTH_PARAM, KuromojiKatakanaStemFilter.DEFAULT_MINIMUM_LENGTH); + if (minimumLength < 2) { + throw new SolrException(SolrException.ErrorCode.UNKNOWN, + "Illegal " + MINIMUM_LENGTH_PARAM + " " + minimumLength + " (must be 2 or greater)"); + } + } + + public TokenStream create(TokenStream input) { /* wraps the given stream in a KuromojiKatakanaStemFilter using the minimum length stored by init() */ + return new KuromojiKatakanaStemFilter(input, minimumLength); + } +} diff --git a/solr/example/solr/conf/schema.xml b/solr/example/solr/conf/schema.xml index 4d9aa22d069..f2425593bca 100755 --- a/solr/example/solr/conf/schema.xml +++ b/solr/example/solr/conf/schema.xml @@ -504,13 +504,13 @@ - + - + @@ -720,7 +720,9 @@ - + + +