mirror of https://github.com/apache/lucene.git
Added katakana stem filter (LUCENE-3901)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1304719 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e0141c7350
commit
63f1c48b7d
|
@ -172,6 +172,10 @@ New Features
|
||||||
* LUCENE-3767: Kuromoji tokenizer/analyzer produces both compound words
|
* LUCENE-3767: Kuromoji tokenizer/analyzer produces both compound words
|
||||||
and the segmentation of that compound in Mode.SEARCH. (Robert Muir, Mike McCandless via Christian Moen)
|
and the segmentation of that compound in Mode.SEARCH. (Robert Muir, Mike McCandless via Christian Moen)
|
||||||
|
|
||||||
|
* LUCENE-3901: Added katakana stem filter to normalize common spelling variants
|
||||||
|
with/without trailing long vowel marks. The filter is used in both KuromojiAnalyzer
|
||||||
|
and the "text_ja" field type in schema.xml. (Christian Moen)
|
||||||
|
|
||||||
* LUCENE-3685: Add ToChildBlockJoinQuery and renamed previous
|
* LUCENE-3685: Add ToChildBlockJoinQuery and renamed previous
|
||||||
BlockJoinQuery to ToParentBlockJoinQuery, so that you can now do
|
BlockJoinQuery to ToParentBlockJoinQuery, so that you can now do
|
||||||
joins in both parent to child and child to parent directions.
|
joins in both parent to child and child to parent directions.
|
||||||
|
|
|
@ -92,6 +92,7 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase {
|
||||||
stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags);
|
stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags);
|
||||||
stream = new CJKWidthFilter(stream);
|
stream = new CJKWidthFilter(stream);
|
||||||
stream = new StopFilter(matchVersion, stream, stopwords);
|
stream = new StopFilter(matchVersion, stream, stopwords);
|
||||||
|
stream = new KuromojiKatakanaStemFilter(stream);
|
||||||
stream = new LowerCaseFilter(matchVersion, stream);
|
stream = new LowerCaseFilter(matchVersion, stream);
|
||||||
return new TokenStreamComponents(tokenizer, stream);
|
return new TokenStreamComponents(tokenizer, stream);
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,98 @@
|
||||||
|
package org.apache.lucene.analysis.kuromoji;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that normalizes common katakana spelling variations
|
||||||
|
* ending in a long sound character by removing this character (U+30FC). Only
|
||||||
|
* katakana words of at least a minimum length are stemmed (default is four).
|
||||||
|
* <p>
|
||||||
|
* Note that only full-width katakana characters are supported. Please use a
|
||||||
|
* {@link org.apache.lucene.analysis.cjk.CJKWidthFilter} to convert half-width
|
||||||
|
* katakana to full-width before using this filter.
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* In order to prevent terms from being stemmed, use an instance of
|
||||||
|
* {@link org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter}
|
||||||
|
* or a custom {@link TokenFilter} that sets the {@link KeywordAttribute}
|
||||||
|
* before this {@link TokenStream}.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
|
||||||
|
public final class KuromojiKatakanaStemFilter extends TokenFilter {
|
||||||
|
public final static int DEFAULT_MINIMUM_LENGTH = 4;
|
||||||
|
private final static char HIRAGANA_KATAKANA_PROLONGED_SOUND_MARK = '\u30fc';
|
||||||
|
|
||||||
|
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||||
|
private final int minimumKatakanaLength;
|
||||||
|
|
||||||
|
public KuromojiKatakanaStemFilter(TokenStream input, int minimumLength) {
|
||||||
|
super(input);
|
||||||
|
this.minimumKatakanaLength = minimumLength;
|
||||||
|
}
|
||||||
|
|
||||||
|
public KuromojiKatakanaStemFilter(TokenStream input) {
|
||||||
|
this(input, DEFAULT_MINIMUM_LENGTH);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAttr.isKeyword()) {
|
||||||
|
termAttr.setLength(stem(termAttr.buffer(), termAttr.length()));
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private int stem(char[] term, int length) {
|
||||||
|
if (length < minimumKatakanaLength) {
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (! isKatakana(term, length)) {
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (term[length - 1] == HIRAGANA_KATAKANA_PROLONGED_SOUND_MARK) {
|
||||||
|
return length - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isKatakana(char[] term, int length) {
|
||||||
|
for (int i = 0; i < length; i++) {
|
||||||
|
// NOTE: Test only identifies full-width characters -- half-widths are not supported
|
||||||
|
if (Character.UnicodeBlock.of(term[i]) != Character.UnicodeBlock.KATAKANA) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
|
@ -24,6 +24,9 @@ import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
|
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test Kuromoji Japanese morphological analyzer
|
||||||
|
*/
|
||||||
public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase {
|
public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase {
|
||||||
/** This test fails with NPE when the
|
/** This test fails with NPE when the
|
||||||
* stopwords file is missing in classpath */
|
* stopwords file is missing in classpath */
|
||||||
|
@ -54,27 +57,26 @@ public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase {
|
||||||
KuromojiAnalyzer.getDefaultStopSet(),
|
KuromojiAnalyzer.getDefaultStopSet(),
|
||||||
KuromojiAnalyzer.getDefaultStopTags());
|
KuromojiAnalyzer.getDefaultStopTags());
|
||||||
|
|
||||||
/*
|
|
||||||
//TokenStream ts = a.tokenStream("foo", new StringReader("妹の咲子です。俺と年子で、今受験生です。"));
|
|
||||||
TokenStream ts = a.tokenStream("foo", new StringReader("�<!--\"<!--#<!--;?><!--#<!--#><!---->?>-->;"));
|
|
||||||
ts.reset();
|
|
||||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
|
||||||
while(ts.incrementToken()) {
|
|
||||||
System.out.println(" " + termAtt.toString());
|
|
||||||
}
|
|
||||||
System.out.println("DONE PARSE\n\n");
|
|
||||||
*/
|
|
||||||
|
|
||||||
// Senior software engineer:
|
// Senior software engineer:
|
||||||
assertAnalyzesToPositions(a, "シニアソフトウェアエンジニア",
|
assertAnalyzesToPositions(a, "シニアソフトウェアエンジニア",
|
||||||
new String[] { "シニア",
|
new String[] { "シニア",
|
||||||
"シニアソフトウェアエンジニア",
|
"シニアソフトウェアエンジニア", // zero pos inc
|
||||||
"ソフトウェア",
|
"ソフトウェア",
|
||||||
"エンジニア" },
|
"エンジニア" },
|
||||||
new int[] { 1, 0, 1, 1},
|
new int[] { 1, 0, 1, 1},
|
||||||
new int[] { 1, 3, 1, 1}
|
new int[] { 1, 3, 1, 1}
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Senior project manager: also tests katakana spelling variation stemming
|
||||||
|
assertAnalyzesToPositions(a, "シニアプロジェクトマネージャー",
|
||||||
|
new String[] { "シニア",
|
||||||
|
"シニアプロジェクトマネージャ", // trailing ー removed by stemming, zero pos inc
|
||||||
|
"プロジェクト",
|
||||||
|
"マネージャ"}, // trailing ー removed by stemming
|
||||||
|
new int[]{1, 0, 1, 1},
|
||||||
|
new int[]{1, 3, 1, 1}
|
||||||
|
);
|
||||||
|
|
||||||
// Kansai International Airport:
|
// Kansai International Airport:
|
||||||
assertAnalyzesToPositions(a, "関西国際空港",
|
assertAnalyzesToPositions(a, "関西国際空港",
|
||||||
new String[] { "関西",
|
new String[] { "関西",
|
||||||
|
|
|
@ -0,0 +1,71 @@
|
||||||
|
package org.apache.lucene.analysis.kuromoji;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests for {@link org.apache.lucene.analysis.kuromoji.KuromojiKatakanaStemFilter}
|
||||||
|
*/
|
||||||
|
public class TestKuromojiKatakanaStemFilter extends BaseTokenStreamTestCase {
|
||||||
|
private Analyzer analyzer = new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
// Use a MockTokenizer here since this filter doesn't really depend on Kuromoji
|
||||||
|
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
|
return new TokenStreamComponents(source, new KuromojiKatakanaStemFilter(source));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test a few common katakana spelling variations.
|
||||||
|
* <p>
|
||||||
|
* English translations are as follows:
|
||||||
|
* <ul>
|
||||||
|
* <li>copy</li>
|
||||||
|
* <li>coffee</li>
|
||||||
|
* <li>taxi</li>
|
||||||
|
* <li>party</li>
|
||||||
|
* <li>party (without long sound)</li>
|
||||||
|
* <li>center</li>
|
||||||
|
* </ul>
|
||||||
|
* Note that we remove a long sound in the case of "coffee" that is required.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public void testStemVariants() throws IOException {
|
||||||
|
assertAnalyzesTo(analyzer, "コピー コーヒー タクシー パーティー パーティ センター",
|
||||||
|
new String[] { "コピー", "コーヒ", "タクシ", "パーティ", "パーティ", "センタ" },
|
||||||
|
new int[] { 0, 4, 9, 14, 20, 25 },
|
||||||
|
new int[] { 3, 8, 13, 19, 24, 29 });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testUnsupportedHalfWidthVariants() throws IOException {
|
||||||
|
// The below result is expected since only full-width katakana is supported
|
||||||
|
assertAnalyzesTo(analyzer, "ﾀｸｼｰ", new String[] { "ﾀｸｼｰ" });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRandomData() throws IOException {
|
||||||
|
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,55 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.kuromoji.KuromojiKatakanaStemFilter;
|
||||||
|
import org.apache.solr.common.SolrException;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory for {@link KuromojiKatakanaStemFilter}.
|
||||||
|
* <pre class="prettyprint">
|
||||||
|
* &lt;fieldType name="text_ja" class="solr.TextField"&gt;
|
||||||
|
* &lt;analyzer&gt;
|
||||||
|
* &lt;tokenizer class="solr.KuromojiTokenizerFactory"/&gt;
|
||||||
|
* &lt;filter class="solr.KuromojiKatakanaStemFilterFactory"
|
||||||
|
* minimumLength="4"/&gt;
|
||||||
|
* &lt;/analyzer&gt;
|
||||||
|
* &lt;/fieldType&gt;
|
||||||
|
* </pre>
|
||||||
|
*/
|
||||||
|
public class KuromojiKatakanaStemFilterFactory extends BaseTokenFilterFactory {
|
||||||
|
private static final String MINIMUM_LENGTH_PARAM = "minimumLength";
|
||||||
|
private int minimumLength;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void init(Map<String, String> args) {
|
||||||
|
super.init(args);
|
||||||
|
minimumLength = getInt(MINIMUM_LENGTH_PARAM, KuromojiKatakanaStemFilter.DEFAULT_MINIMUM_LENGTH);
|
||||||
|
if (minimumLength < 2) {
|
||||||
|
throw new SolrException(SolrException.ErrorCode.UNKNOWN,
|
||||||
|
"Illegal " + MINIMUM_LENGTH_PARAM + " " + minimumLength + " (must be 2 or greater)");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new KuromojiKatakanaStemFilter(input, minimumLength);
|
||||||
|
}
|
||||||
|
}
|
|
@ -720,7 +720,9 @@
|
||||||
<filter class="solr.CJKWidthFilterFactory"/>
|
<filter class="solr.CJKWidthFilterFactory"/>
|
||||||
<!-- Removes common tokens typically not useful for search, but have a negative effect on ranking -->
|
<!-- Removes common tokens typically not useful for search, but have a negative effect on ranking -->
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" />
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" />
|
||||||
<!-- Lower-case romaji characters -->
|
<!-- Normalizes common katakana spelling variations by removing a trailing long sound character (U+30FC) -->
|
||||||
|
<filter class="solr.KuromojiKatakanaStemFilterFactory" minimumLength="4"/>
|
||||||
|
<!-- Lower-cases romaji characters -->
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
|
Loading…
Reference in New Issue