Added katakana stem filter (LUCENE-3901)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1304719 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Christian Moen 2012-03-24 06:38:53 +00:00
parent e0141c7350
commit 63f1c48b7d
7 changed files with 248 additions and 15 deletions

View File

@ -172,6 +172,10 @@ New Features
* LUCENE-3767: Kuromoji tokenizer/analyzer produces both compound words * LUCENE-3767: Kuromoji tokenizer/analyzer produces both compound words
and the segmentation of that compound in Mode.SEARCH. (Robert Muir, Mike McCandless via Christian Moen) and the segmentation of that compound in Mode.SEARCH. (Robert Muir, Mike McCandless via Christian Moen)
* LUCENE-3901: Added katakana stem filter to normalize common spelling variants
with/without trailing long vowel marks. The filter is used in both KuromojiAnalyzer
and the "text_ja" field type in schema.xml. (Christian Moen)
* LUCENE-3685: Add ToChildBlockJoinQuery and renamed previous * LUCENE-3685: Add ToChildBlockJoinQuery and renamed previous
BlockJoinQuery to ToParentBlockJoinQuery, so that you can now do BlockJoinQuery to ToParentBlockJoinQuery, so that you can now do
joins in both parent to child and child to parent directions. joins in both parent to child and child to parent directions.

View File

@ -92,6 +92,7 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase {
stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags); stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags);
stream = new CJKWidthFilter(stream); stream = new CJKWidthFilter(stream);
stream = new StopFilter(matchVersion, stream, stopwords); stream = new StopFilter(matchVersion, stream, stopwords);
stream = new KuromojiKatakanaStemFilter(stream);
stream = new LowerCaseFilter(matchVersion, stream); stream = new LowerCaseFilter(matchVersion, stream);
return new TokenStreamComponents(tokenizer, stream); return new TokenStreamComponents(tokenizer, stream);
} }

View File

@ -0,0 +1,98 @@
package org.apache.lucene.analysis.kuromoji;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import java.io.IOException;
/**
* A {@link TokenFilter} that normalizes common katakana spelling variations
* ending in a long sound character by removing this character (U+30FC). Only
* katakana words longer than a minimum length are stemmed (default is four).
* <p>
* Note that only full-width katakana characters are supported. Please use a
* {@link org.apache.lucene.analysis.cjk.CJKWidthFilter} to convert half-width
* katakana to full-width before using this filter.
* </p>
* <p>
* In order to prevent terms from being stemmed, use an instance of
* {@link org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter}
* or a custom {@link TokenFilter} that sets the {@link KeywordAttribute}
* before this {@link TokenStream}.
* </p>
*/
public final class KuromojiKatakanaStemFilter extends TokenFilter {
public final static int DEFAULT_MINIMUM_LENGTH = 4;
private final static char HIRAGANA_KATAKANA_PROLONGED_SOUND_MARK = '\u30fc';
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
private final int minimumKatakanaLength;
public KuromojiKatakanaStemFilter(TokenStream input, int minimumLength) {
super(input);
this.minimumKatakanaLength = minimumLength;
}
public KuromojiKatakanaStemFilter(TokenStream input) {
this(input, DEFAULT_MINIMUM_LENGTH);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
if (!keywordAttr.isKeyword()) {
termAttr.setLength(stem(termAttr.buffer(), termAttr.length()));
}
return true;
} else {
return false;
}
}
private int stem(char[] term, int length) {
if (length < minimumKatakanaLength) {
return length;
}
if (! isKatakana(term, length)) {
return length;
}
if (term[length - 1] == HIRAGANA_KATAKANA_PROLONGED_SOUND_MARK) {
return length - 1;
}
return length;
}
private boolean isKatakana(char[] term, int length) {
for (int i = 0; i < length; i++) {
// NOTE: Test only identifies full-width characters -- half-widths are supported
if (Character.UnicodeBlock.of(term[i]) != Character.UnicodeBlock.KATAKANA) {
return false;
}
}
return true;
}
}

View File

@ -24,6 +24,9 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode; import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
/**
* Test Kuromoji Japanese morphological analyzer
*/
public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase { public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the /** This test fails with NPE when the
* stopwords file is missing in classpath */ * stopwords file is missing in classpath */
@ -54,27 +57,26 @@ public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase {
KuromojiAnalyzer.getDefaultStopSet(), KuromojiAnalyzer.getDefaultStopSet(),
KuromojiAnalyzer.getDefaultStopTags()); KuromojiAnalyzer.getDefaultStopTags());
/*
//TokenStream ts = a.tokenStream("foo", new StringReader("妹の咲子です。俺と年子で、今受験生です。"));
TokenStream ts = a.tokenStream("foo", new StringReader("&#x250cdf66<!--\"<!--#<!--;?><!--#<!--#><!---->?>-->;"));
ts.reset();
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
while(ts.incrementToken()) {
System.out.println(" " + termAtt.toString());
}
System.out.println("DONE PARSE\n\n");
*/
// Senior software engineer: // Senior software engineer:
assertAnalyzesToPositions(a, "シニアソフトウェアエンジニア", assertAnalyzesToPositions(a, "シニアソフトウェアエンジニア",
new String[] { "シニア", new String[] { "シニア",
"シニアソフトウェアエンジニア", "シニアソフトウェアエンジニア", // zero pos inc
"ソフトウェア", "ソフトウェア",
"エンジニア" }, "エンジニア" },
new int[] { 1, 0, 1, 1}, new int[] { 1, 0, 1, 1},
new int[] { 1, 3, 1, 1} new int[] { 1, 3, 1, 1}
); );
// Senior project manager: also tests katakana spelling variation stemming
assertAnalyzesToPositions(a, "シニアプロジェクトマネージャー",
new String[] { "シニア",
"シニアプロジェクトマネージャ", // trailing removed by stemming, zero pos inc
"プロジェクト",
"マネージャ"}, // trailing removed by stemming
new int[]{1, 0, 1, 1},
new int[]{1, 3, 1, 1}
);
// Kansai International Airport: // Kansai International Airport:
assertAnalyzesToPositions(a, "関西国際空港", assertAnalyzesToPositions(a, "関西国際空港",
new String[] { "関西", new String[] { "関西",

View File

@ -0,0 +1,71 @@
package org.apache.lucene.analysis.kuromoji;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import java.io.IOException;
import java.io.Reader;
/**
* Tests for {@link org.apache.lucene.analysis.kuromoji.KuromojiKatakanaStemFilter}
*/
public class TestKuromojiKatakanaStemFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
// Use a MockTokenizer here since this filter doesn't really depend on Kuromoji
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new KuromojiKatakanaStemFilter(source));
}
};
/**
* Test a few common katakana spelling variations.
* <p>
* English translations are as follows:
* <ul>
* <li>copy</li>
* <li>coffee</li>
* <li>taxi</li>
* <li>party</li>
* <li>party (without long sound)</li>
* <li>center</li>
* </ul>
* Note that we remove a long sound in the case of "coffee" that is required.
* </p>
*/
public void testStemVariants() throws IOException {
assertAnalyzesTo(analyzer, "コピー コーヒー タクシー パーティー パーティ センター",
new String[] { "コピー", "コーヒ", "タクシ", "パーティ", "パーティ", "センタ" },
new int[] { 0, 4, 9, 14, 20, 25 },
new int[] { 3, 8, 13, 19, 24, 29 });
}
public void testUnsupportedHalfWidthVariants() throws IOException {
// The below result is expected since only full-width katakana is supported
assertAnalyzesTo(analyzer, "タクシー", new String[] { "タクシー" });
}
public void testRandomData() throws IOException {
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -0,0 +1,55 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.kuromoji.KuromojiKatakanaStemFilter;
import org.apache.solr.common.SolrException;
import java.util.Map;
/**
* Factory for {@link KuromojiKatakanaStemFilterFactory}.
* <pre class="prettyprint">
* &lt;fieldType name="text_ja" class="solr.TextField"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.KuromojiTokenizerFactory"/&gt;
* &lt;filter class="solr.KuromojiKatakanaStemFilterFactory"
* minimumLength="4"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;
* </pre>
*/
public class KuromojiKatakanaStemFilterFactory extends BaseTokenFilterFactory {
private static final String MINIMUM_LENGTH_PARAM = "minimumLength";
private int minimumLength;
@Override
public void init(Map<String, String> args) {
super.init(args);
minimumLength = getInt(MINIMUM_LENGTH_PARAM, KuromojiKatakanaStemFilter.DEFAULT_MINIMUM_LENGTH);
if (minimumLength < 2) {
throw new SolrException(SolrException.ErrorCode.UNKNOWN,
"Illegal " + MINIMUM_LENGTH_PARAM + " " + minimumLength + " (must be 2 or greater)");
}
}
public TokenStream create(TokenStream input) {
return new KuromojiKatakanaStemFilter(input, minimumLength);
}
}

View File

@ -720,7 +720,9 @@
<filter class="solr.CJKWidthFilterFactory"/> <filter class="solr.CJKWidthFilterFactory"/>
<!-- Removes common tokens typically not useful for search, but have a negative effect on ranking --> <!-- Removes common tokens typically not useful for search, but have a negative effect on ranking -->
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" /> <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" />
<!-- Lower-case romaji characters --> <!-- Normalizes common katakana spelling variations by removing any last long sound character (U+30FC) -->
<filter class="solr.KuromojiKatakanaStemFilterFactory" minimumLength="4"/>
<!-- Lower-cases romaji characters -->
<filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.LowerCaseFilterFactory"/>
</analyzer> </analyzer>
</fieldType> </fieldType>