Added KuromojiReadingFormFilter (LUCENE-3915)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1305046 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Christian Moen 2012-03-25 14:17:23 +00:00
parent fb33754168
commit c3ddb9dc67
5 changed files with 196 additions and 2 deletions

View File

@ -176,6 +176,9 @@ New Features
with/without trailing long vowel marks. The filter is used in both KuromojiAnalyzer
and the "text_ja" field type in schema.xml. (Christian Moen)
* LUCENE-3915: Add Japanese filter to replace a term attribute with its reading.
(Koji Sekiguchi, Robert Muir, Christian Moen)
* LUCENE-3685: Add ToChildBlockJoinQuery and renamed previous
BlockJoinQuery to ToParentBlockJoinQuery, so that you can now do
joins in both parent to child and child to parent directions.

View File

@ -0,0 +1,65 @@
package org.apache.lucene.analysis.kuromoji;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.kuromoji.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.kuromoji.util.ToStringUtil;
import java.io.IOException;
/**
* A {@link org.apache.lucene.analysis.TokenFilter} that replaces the term
* attribute with the reading of a token in either katakana or romaji form.
* The default reading form is katakana.
*/
public final class KuromojiReadingFormFilter extends TokenFilter {
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
private final ReadingAttribute readingAttr = addAttribute(ReadingAttribute.class);
private boolean useRomaji;
public KuromojiReadingFormFilter(TokenStream input, boolean useRomaji) {
super(input);
this.useRomaji = useRomaji;
}
public KuromojiReadingFormFilter(TokenStream input) {
this(input, false);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
String reading = readingAttr.getReading();
if (reading != null) {
if (useRomaji) {
ToStringUtil.getRomanization(termAttr.setEmpty(), reading);
} else {
termAttr.setEmpty().append(reading);
}
}
return true;
} else {
return false;
}
}
}

View File

@ -17,6 +17,7 @@ package org.apache.lucene.analysis.kuromoji.util;
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashMap;
/**
@ -239,7 +240,19 @@ public class ToStringUtil {
* Romanize katakana with modified hepburn
*/
public static String getRomanization(String s) {
StringBuilder builder = new StringBuilder();
StringBuilder out = new StringBuilder();
try {
getRomanization(out, s);
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
return out.toString();
}
/**
* Romanize katakana with modified hepburn
*/
public static void getRomanization(Appendable builder, CharSequence s) throws IOException {
final int len = s.length();
for (int i = 0; i < len; i++) {
// maximum lookahead: 3
@ -1022,6 +1035,5 @@ public class ToStringUtil {
builder.append(ch);
}
}
return builder.toString();
}
}

View File

@ -0,0 +1,64 @@
package org.apache.lucene.analysis.kuromoji;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import java.io.IOException;
import java.io.Reader;
/**
* Tests for {@link TestKuromojiReadingFormFilter}
*/
public class TestKuromojiReadingFormFilter extends BaseTokenStreamTestCase {
private Analyzer katakanaAnalyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, KuromojiTokenizer.Mode.SEARCH);
return new TokenStreamComponents(tokenizer, new KuromojiReadingFormFilter(tokenizer, false));
}
};
private Analyzer romajiAnalyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, KuromojiTokenizer.Mode.SEARCH);
return new TokenStreamComponents(tokenizer, new KuromojiReadingFormFilter(tokenizer, true));
}
};
public void testKatakanaReadings() throws IOException {
assertAnalyzesTo(katakanaAnalyzer, "今夜はロバート先生と話した",
new String[] { "コンヤ", "", "ロバート", "センセイ", "", "ハナシ", "" }
);
}
public void testRomajiReadings() throws IOException {
assertAnalyzesTo(romajiAnalyzer, "今夜はロバート先生と話した",
new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" }
);
}
public void testRandomData() throws IOException {
checkRandomData(random, katakanaAnalyzer, 1000*RANDOM_MULTIPLIER);
checkRandomData(random, romajiAnalyzer, 1000*RANDOM_MULTIPLIER);
}
}

View File

@ -0,0 +1,50 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.kuromoji.KuromojiReadingFormFilter;
import java.util.Map;
/**
* Factory for {@link KuromojiReadingFormFilter}.
* <pre class="prettyprint">
* &lt;fieldType name="text_ja" class="solr.TextField"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.KuromojiTokenizerFactory"/&gt;
* &lt;filter class="solr.KuromojiReadingFormFilterFactory"
* useRomaji="false"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;
* </pre>
*/
public class KuromojiReadingFormFilterFactory extends BaseTokenFilterFactory {
private static final String ROMAJI_PARAM = "useRomaji";
private boolean useRomaji;
@Override
public void init(Map<String, String> args) {
super.init(args);
useRomaji = getBoolean(ROMAJI_PARAM, false);
}
public TokenStream create(TokenStream input) {
return new KuromojiReadingFormFilter(input, useRomaji);
}
}