mirror of https://github.com/apache/lucene.git
Added KuromojiReadingFormFilter (LUCENE-3915)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1305046 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
fb33754168
commit
c3ddb9dc67
|
@ -176,6 +176,9 @@ New Features
|
|||
with/without trailing long vowel marks. The filter is used in both KuromojiAnalyzer
|
||||
and the "text_ja" field type in schema.xml. (Christian Moen)
|
||||
|
||||
* LUCENE-3915: Add Japanese filter to replace a term attribute with its reading.
|
||||
(Koji Sekiguchi, Robert Muir, Christian Moen)
|
||||
|
||||
* LUCENE-3685: Add ToChildBlockJoinQuery and renamed previous
|
||||
BlockJoinQuery to ToParentBlockJoinQuery, so that you can now do
|
||||
joins in both parent to child and child to parent directions.
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
package org.apache.lucene.analysis.kuromoji;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.kuromoji.tokenattributes.ReadingAttribute;
|
||||
import org.apache.lucene.analysis.kuromoji.util.ToStringUtil;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* A {@link org.apache.lucene.analysis.TokenFilter} that replaces the term
|
||||
* attribute with the reading of a token in either katakana or romaji form.
|
||||
* The default reading form is katakana.
|
||||
*/
|
||||
|
||||
public final class KuromojiReadingFormFilter extends TokenFilter {
|
||||
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
|
||||
private final ReadingAttribute readingAttr = addAttribute(ReadingAttribute.class);
|
||||
|
||||
private boolean useRomaji;
|
||||
|
||||
public KuromojiReadingFormFilter(TokenStream input, boolean useRomaji) {
|
||||
super(input);
|
||||
this.useRomaji = useRomaji;
|
||||
}
|
||||
|
||||
public KuromojiReadingFormFilter(TokenStream input) {
|
||||
this(input, false);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
String reading = readingAttr.getReading();
|
||||
if (reading != null) {
|
||||
if (useRomaji) {
|
||||
ToStringUtil.getRomanization(termAttr.setEmpty(), reading);
|
||||
} else {
|
||||
termAttr.setEmpty().append(reading);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -17,6 +17,7 @@ package org.apache.lucene.analysis.kuromoji.util;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
|
||||
|
@ -239,7 +240,19 @@ public class ToStringUtil {
|
|||
* Romanize katakana with modified hepburn
|
||||
*/
|
||||
public static String getRomanization(String s) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
StringBuilder out = new StringBuilder();
|
||||
try {
|
||||
getRomanization(out, s);
|
||||
} catch (IOException bogus) {
|
||||
throw new RuntimeException(bogus);
|
||||
}
|
||||
return out.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Romanize katakana with modified hepburn
|
||||
*/
|
||||
public static void getRomanization(Appendable builder, CharSequence s) throws IOException {
|
||||
final int len = s.length();
|
||||
for (int i = 0; i < len; i++) {
|
||||
// maximum lookahead: 3
|
||||
|
@ -1022,6 +1035,5 @@ public class ToStringUtil {
|
|||
builder.append(ch);
|
||||
}
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
package org.apache.lucene.analysis.kuromoji;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
/**
|
||||
* Tests for {@link TestKuromojiReadingFormFilter}
|
||||
*/
|
||||
public class TestKuromojiReadingFormFilter extends BaseTokenStreamTestCase {
|
||||
private Analyzer katakanaAnalyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, KuromojiTokenizer.Mode.SEARCH);
|
||||
return new TokenStreamComponents(tokenizer, new KuromojiReadingFormFilter(tokenizer, false));
|
||||
}
|
||||
};
|
||||
|
||||
private Analyzer romajiAnalyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, KuromojiTokenizer.Mode.SEARCH);
|
||||
return new TokenStreamComponents(tokenizer, new KuromojiReadingFormFilter(tokenizer, true));
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
public void testKatakanaReadings() throws IOException {
|
||||
assertAnalyzesTo(katakanaAnalyzer, "今夜はロバート先生と話した",
|
||||
new String[] { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" }
|
||||
);
|
||||
}
|
||||
|
||||
public void testRomajiReadings() throws IOException {
|
||||
assertAnalyzesTo(romajiAnalyzer, "今夜はロバート先生と話した",
|
||||
new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" }
|
||||
);
|
||||
}
|
||||
|
||||
public void testRandomData() throws IOException {
|
||||
checkRandomData(random, katakanaAnalyzer, 1000*RANDOM_MULTIPLIER);
|
||||
checkRandomData(random, romajiAnalyzer, 1000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,50 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.kuromoji.KuromojiReadingFormFilter;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Factory for {@link KuromojiReadingFormFilter}.
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_ja" class="solr.TextField">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.KuromojiTokenizerFactory"/>
|
||||
* <filter class="solr.KuromojiReadingFormFilterFactory"
|
||||
* useRomaji="false"/>
|
||||
* </analyzer>
|
||||
* </fieldType>
|
||||
* </pre>
|
||||
*/
|
||||
public class KuromojiReadingFormFilterFactory extends BaseTokenFilterFactory {
|
||||
private static final String ROMAJI_PARAM = "useRomaji";
|
||||
private boolean useRomaji;
|
||||
|
||||
@Override
|
||||
public void init(Map<String, String> args) {
|
||||
super.init(args);
|
||||
useRomaji = getBoolean(ROMAJI_PARAM, false);
|
||||
}
|
||||
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new KuromojiReadingFormFilter(input, useRomaji);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue