diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 9be7c7eea45..7e462f38744 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -176,6 +176,9 @@ New Features with/without trailing long vowel marks. The filter is used in both KuromojiAnalyzer and the "text_ja" field type in schema.xml. (Christian Moen) + * LUCENE-3915: Add Japanese filter to replace a term attribute with its reading. + (Koji Sekiguchi, Robert Muir, Christian Moen) + * LUCENE-3685: Add ToChildBlockJoinQuery and renamed previous BlockJoinQuery to ToParentBlockJoinQuery, so that you can now do joins in both parent to child and child to parent directions. diff --git a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiReadingFormFilter.java b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiReadingFormFilter.java new file mode 100644 index 00000000000..352fdd42bb9 --- /dev/null +++ b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiReadingFormFilter.java @@ -0,0 +1,65 @@ +package org.apache.lucene.analysis.kuromoji; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.kuromoji.tokenattributes.ReadingAttribute; +import org.apache.lucene.analysis.kuromoji.util.ToStringUtil; + +import java.io.IOException; + +/** + * A {@link org.apache.lucene.analysis.TokenFilter} that replaces the term + * attribute with the reading of a token in either katakana or romaji form. + * The default reading form is katakana. + */ + +public final class KuromojiReadingFormFilter extends TokenFilter { + private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class); + private final ReadingAttribute readingAttr = addAttribute(ReadingAttribute.class); + + private boolean useRomaji; + + public KuromojiReadingFormFilter(TokenStream input, boolean useRomaji) { + super(input); + this.useRomaji = useRomaji; + } + + public KuromojiReadingFormFilter(TokenStream input) { + this(input, false); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String reading = readingAttr.getReading(); + if (reading != null) { + if (useRomaji) { + ToStringUtil.getRomanization(termAttr.setEmpty(), reading); + } else { + termAttr.setEmpty().append(reading); + } + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java index fb19233e8a9..2db22fdc933 100644 --- a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java +++ b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java @@ -17,6 +17,7 @@ package org.apache.lucene.analysis.kuromoji.util; * limitations under the License. */ +import java.io.IOException; import java.util.HashMap; /** @@ -239,7 +240,19 @@ public class ToStringUtil { * Romanize katakana with modified hepburn */ public static String getRomanization(String s) { - StringBuilder builder = new StringBuilder(); + StringBuilder out = new StringBuilder(); + try { + getRomanization(out, s); + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } + return out.toString(); + } + + /** + * Romanize katakana with modified hepburn + */ + public static void getRomanization(Appendable builder, CharSequence s) throws IOException { final int len = s.length(); for (int i = 0; i < len; i++) { // maximum lookahead: 3 @@ -1022,6 +1035,5 @@ public class ToStringUtil { builder.append(ch); } } - return builder.toString(); } } diff --git a/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiReadingFormFilter.java b/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiReadingFormFilter.java new file mode 100644 index 00000000000..1f237bb681c --- /dev/null +++ b/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiReadingFormFilter.java @@ -0,0 +1,64 @@ +package org.apache.lucene.analysis.kuromoji; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; + +import java.io.IOException; +import java.io.Reader; + +/** + * Tests for {@link TestKuromojiReadingFormFilter} + */ +public class TestKuromojiReadingFormFilter extends BaseTokenStreamTestCase { + private Analyzer katakanaAnalyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, KuromojiTokenizer.Mode.SEARCH); + return new TokenStreamComponents(tokenizer, new KuromojiReadingFormFilter(tokenizer, false)); + } + }; + + private Analyzer romajiAnalyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, KuromojiTokenizer.Mode.SEARCH); + return new TokenStreamComponents(tokenizer, new KuromojiReadingFormFilter(tokenizer, true)); + } + }; + + + public void testKatakanaReadings() throws IOException { + assertAnalyzesTo(katakanaAnalyzer, "今夜はロバート先生と話した", + new String[] { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" } + ); + } + + public void testRomajiReadings() throws IOException { + assertAnalyzesTo(romajiAnalyzer, "今夜はロバート先生と話した", + new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" } + ); + } + + public void testRandomData() throws IOException { + checkRandomData(random, katakanaAnalyzer, 1000*RANDOM_MULTIPLIER); + checkRandomData(random, romajiAnalyzer, 1000*RANDOM_MULTIPLIER); + } +} diff --git a/solr/core/src/java/org/apache/solr/analysis/KuromojiReadingFormFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/KuromojiReadingFormFilterFactory.java new file mode 100644 index 00000000000..c947937b33f --- /dev/null +++ b/solr/core/src/java/org/apache/solr/analysis/KuromojiReadingFormFilterFactory.java @@ -0,0 +1,50 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.kuromoji.KuromojiReadingFormFilter; + +import java.util.Map; + +/** + * Factory for {@link KuromojiReadingFormFilter}. + *
+ * <fieldType name="text_ja" class="solr.TextField">
+ *   <analyzer>
+ *     <tokenizer class="solr.KuromojiTokenizerFactory"/>
+ *     <filter class="solr.KuromojiReadingFormFilterFactory"
+ *             useRomaji="false"/>
+ *   </analyzer>
+ * </fieldType>
+ * 
+ */ +public class KuromojiReadingFormFilterFactory extends BaseTokenFilterFactory { + private static final String ROMAJI_PARAM = "useRomaji"; + private boolean useRomaji; + + @Override + public void init(Map args) { + super.init(args); + useRomaji = getBoolean(ROMAJI_PARAM, false); + } + + public TokenStream create(TokenStream input) { + return new KuromojiReadingFormFilter(input, useRomaji); + } +}