mirror of https://github.com/apache/lucene.git
Added KuromojiReadingFormFilter (LUCENE-3915)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1305046 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
fb33754168
commit
c3ddb9dc67
|
@ -176,6 +176,9 @@ New Features
|
||||||
with/without trailing long vowel marks. The filter is used in both KuromojiAnalyzer
|
with/without trailing long vowel marks. The filter is used in both KuromojiAnalyzer
|
||||||
and the "text_ja" field type in schema.xml. (Christian Moen)
|
and the "text_ja" field type in schema.xml. (Christian Moen)
|
||||||
|
|
||||||
|
* LUCENE-3915: Add Japanese filter to replace a term attribute with its reading.
|
||||||
|
(Koji Sekiguchi, Robert Muir, Christian Moen)
|
||||||
|
|
||||||
* LUCENE-3685: Add ToChildBlockJoinQuery and renamed previous
|
* LUCENE-3685: Add ToChildBlockJoinQuery and renamed previous
|
||||||
BlockJoinQuery to ToParentBlockJoinQuery, so that you can now do
|
BlockJoinQuery to ToParentBlockJoinQuery, so that you can now do
|
||||||
joins in both parent to child and child to parent directions.
|
joins in both parent to child and child to parent directions.
|
||||||
|
|
|
@ -0,0 +1,65 @@
|
||||||
|
package org.apache.lucene.analysis.kuromoji;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.kuromoji.tokenattributes.ReadingAttribute;
|
||||||
|
import org.apache.lucene.analysis.kuromoji.util.ToStringUtil;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link org.apache.lucene.analysis.TokenFilter} that replaces the term
|
||||||
|
* attribute with the reading of a token in either katakana or romaji form.
|
||||||
|
* The default reading form is katakana.
|
||||||
|
*/
|
||||||
|
|
||||||
|
public final class KuromojiReadingFormFilter extends TokenFilter {
|
||||||
|
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
|
||||||
|
private final ReadingAttribute readingAttr = addAttribute(ReadingAttribute.class);
|
||||||
|
|
||||||
|
private boolean useRomaji;
|
||||||
|
|
||||||
|
public KuromojiReadingFormFilter(TokenStream input, boolean useRomaji) {
|
||||||
|
super(input);
|
||||||
|
this.useRomaji = useRomaji;
|
||||||
|
}
|
||||||
|
|
||||||
|
public KuromojiReadingFormFilter(TokenStream input) {
|
||||||
|
this(input, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
String reading = readingAttr.getReading();
|
||||||
|
if (reading != null) {
|
||||||
|
if (useRomaji) {
|
||||||
|
ToStringUtil.getRomanization(termAttr.setEmpty(), reading);
|
||||||
|
} else {
|
||||||
|
termAttr.setEmpty().append(reading);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -17,6 +17,7 @@ package org.apache.lucene.analysis.kuromoji.util;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -239,7 +240,19 @@ public class ToStringUtil {
|
||||||
* Romanize katakana with modified hepburn
|
* Romanize katakana with modified hepburn
|
||||||
*/
|
*/
|
||||||
public static String getRomanization(String s) {
|
public static String getRomanization(String s) {
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder out = new StringBuilder();
|
||||||
|
try {
|
||||||
|
getRomanization(out, s);
|
||||||
|
} catch (IOException bogus) {
|
||||||
|
throw new RuntimeException(bogus);
|
||||||
|
}
|
||||||
|
return out.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Romanize katakana with modified hepburn
|
||||||
|
*/
|
||||||
|
public static void getRomanization(Appendable builder, CharSequence s) throws IOException {
|
||||||
final int len = s.length();
|
final int len = s.length();
|
||||||
for (int i = 0; i < len; i++) {
|
for (int i = 0; i < len; i++) {
|
||||||
// maximum lookahead: 3
|
// maximum lookahead: 3
|
||||||
|
@ -1022,6 +1035,5 @@ public class ToStringUtil {
|
||||||
builder.append(ch);
|
builder.append(ch);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return builder.toString();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,64 @@
|
||||||
|
package org.apache.lucene.analysis.kuromoji;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests for {@link KuromojiReadingFormFilter}
|
||||||
|
*/
|
||||||
|
public class TestKuromojiReadingFormFilter extends BaseTokenStreamTestCase {
|
||||||
|
private Analyzer katakanaAnalyzer = new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, KuromojiTokenizer.Mode.SEARCH);
|
||||||
|
return new TokenStreamComponents(tokenizer, new KuromojiReadingFormFilter(tokenizer, false));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
private Analyzer romajiAnalyzer = new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, KuromojiTokenizer.Mode.SEARCH);
|
||||||
|
return new TokenStreamComponents(tokenizer, new KuromojiReadingFormFilter(tokenizer, true));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
public void testKatakanaReadings() throws IOException {
|
||||||
|
assertAnalyzesTo(katakanaAnalyzer, "今夜はロバート先生と話した",
|
||||||
|
new String[] { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" }
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRomajiReadings() throws IOException {
|
||||||
|
assertAnalyzesTo(romajiAnalyzer, "今夜はロバート先生と話した",
|
||||||
|
new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" }
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRandomData() throws IOException {
|
||||||
|
checkRandomData(random, katakanaAnalyzer, 1000*RANDOM_MULTIPLIER);
|
||||||
|
checkRandomData(random, romajiAnalyzer, 1000*RANDOM_MULTIPLIER);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,50 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.kuromoji.KuromojiReadingFormFilter;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory for {@link KuromojiReadingFormFilter}.
|
||||||
|
* <pre class="prettyprint">
|
||||||
|
* <fieldType name="text_ja" class="solr.TextField">
|
||||||
|
* <analyzer>
|
||||||
|
* <tokenizer class="solr.KuromojiTokenizerFactory"/>
|
||||||
|
* <filter class="solr.KuromojiReadingFormFilterFactory"
|
||||||
|
* useRomaji="false"/>
|
||||||
|
* </analyzer>
|
||||||
|
* </fieldType>
|
||||||
|
* </pre>
|
||||||
|
*/
|
||||||
|
public class KuromojiReadingFormFilterFactory extends BaseTokenFilterFactory {
|
||||||
|
private static final String ROMAJI_PARAM = "useRomaji";
|
||||||
|
private boolean useRomaji;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void init(Map<String, String> args) {
|
||||||
|
super.init(args);
|
||||||
|
useRomaji = getBoolean(ROMAJI_PARAM, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new KuromojiReadingFormFilter(input, useRomaji);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue