diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 2d98bc55d77..c2efeede89f 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -194,6 +194,9 @@ New Features * GITHUB#13125: Recursive graph bisection is now supported on indexes that have blocks, as long as they configure a parent field via `IndexWriterConfig#setParentField`. (Adrien Grand) +* GITHUB#12915: Add new token filters for Japanese sutegana (捨て仮名). This introduces JapaneseHiraganaUppercaseFilter + and JapaneseKatakanaUppercaseFilter. (Dai Sugimori) + Improvements --------------------- @@ -258,7 +261,6 @@ API Changes New Features --------------------- - * GITHUB#12679: Add support for similarity-based vector searches using [Byte|Float]VectorSimilarityQuery. Uses a new VectorSimilarityCollector to find all vectors scoring above a `resultSimilarity` while traversing the HNSW graph till better-scoring nodes are available, or the best candidate is below a score of `traversalSimilarity` in the lowest diff --git a/lucene/analysis/kuromoji/src/java/module-info.java b/lucene/analysis/kuromoji/src/java/module-info.java index f2040f628a1..e849ed644bc 100644 --- a/lucene/analysis/kuromoji/src/java/module-info.java +++ b/lucene/analysis/kuromoji/src/java/module-info.java @@ -40,5 +40,7 @@ module org.apache.lucene.analysis.kuromoji { org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilterFactory, org.apache.lucene.analysis.ja.JapaneseNumberFilterFactory, org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilterFactory, - org.apache.lucene.analysis.ja.JapaneseReadingFormFilterFactory; + org.apache.lucene.analysis.ja.JapaneseReadingFormFilterFactory, + org.apache.lucene.analysis.ja.JapaneseHiraganaUppercaseFilterFactory, + org.apache.lucene.analysis.ja.JapaneseKatakanaUppercaseFilterFactory; } diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseHiraganaUppercaseFilter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseHiraganaUppercaseFilter.java new file mode 100644 index 00000000000..b5078b73607 --- /dev/null +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseHiraganaUppercaseFilter.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.ja; + +import java.io.IOException; +import java.util.Map; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +/** + * A {@link TokenFilter} that normalizes small letters (捨て仮名) in hiragana into normal letters. For + * instance, "ちょっとまって" will be translated to "ちよつとまつて". + * + *

This filter is useful if you want to search against old style Japanese text such as patents, + * legal, contract policies, etc. + */ +public final class JapaneseHiraganaUppercaseFilter extends TokenFilter { + private static final Map LETTER_MAPPINGS; + + static { + // supported characters are: + // ぁ ぃ ぅ ぇ ぉ っ ゃ ゅ ょ ゎ ゕ ゖ + LETTER_MAPPINGS = + Map.ofEntries( + Map.entry('ぁ', 'あ'), + Map.entry('ぃ', 'い'), + Map.entry('ぅ', 'う'), + Map.entry('ぇ', 'え'), + Map.entry('ぉ', 'お'), + Map.entry('っ', 'つ'), + Map.entry('ゃ', 'や'), + Map.entry('ゅ', 'ゆ'), + Map.entry('ょ', 'よ'), + Map.entry('ゎ', 'わ'), + Map.entry('ゕ', 'か'), + Map.entry('ゖ', 'け')); + } + + private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class); + + public JapaneseHiraganaUppercaseFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + char[] termBuffer = termAttr.buffer(); + for (int i = 0; i < termBuffer.length; i++) { + Character c = LETTER_MAPPINGS.get(termBuffer[i]); + if (c != null) { + termBuffer[i] = c; + } + } + return true; + } else { + return false; + } + } +} diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseHiraganaUppercaseFilterFactory.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseHiraganaUppercaseFilterFactory.java new file mode 100644 index 00000000000..872d6df406f --- /dev/null +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseHiraganaUppercaseFilterFactory.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.ja; + +import java.util.Map; +import org.apache.lucene.analysis.TokenFilterFactory; +import org.apache.lucene.analysis.TokenStream; + +/** + * Factory for {@link JapaneseHiraganaUppercaseFilter}. + * + * @lucene.spi {@value #NAME} + */ +public class JapaneseHiraganaUppercaseFilterFactory extends TokenFilterFactory { + + /** SPI name */ + public static final String NAME = "japaneseHiraganaUppercase"; + + public JapaneseHiraganaUppercaseFilterFactory(Map args) { + super(args); + if (!args.isEmpty()) { + throw new IllegalArgumentException("Unknown parameters: " + args); + } + } + + /** Default ctor for compatibility with SPI */ + public JapaneseHiraganaUppercaseFilterFactory() { + throw defaultCtorException(); + } + + @Override + public TokenStream create(TokenStream input) { + return new JapaneseHiraganaUppercaseFilter(input); + } +} diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaUppercaseFilter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaUppercaseFilter.java new file mode 100644 index 00000000000..5e05714d1c3 --- /dev/null +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaUppercaseFilter.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.ja; + +import java.io.IOException; +import java.util.Map; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +/** + * A {@link TokenFilter} that normalizes small letters (捨て仮名) in katakana into normal letters. For + * instance, "ストップウォッチ" will be translated to "ストツプウオツチ". + * + *

This filter is useful if you want to search against old style Japanese text such as patents, + * legal, contract policies, etc. + */ +public final class JapaneseKatakanaUppercaseFilter extends TokenFilter { + private static final Map LETTER_MAPPINGS; + + static { + // supported characters are: + // ァ ィ ゥ ェ ォ ヵ ㇰ ヶ ㇱ ㇲ ッ ㇳ ㇴ ㇵ ㇶ ㇷ ㇷ゚ ㇸ ㇹ ㇺ ャ ュ ョ ㇻ ㇼ ㇽ ㇾ ㇿ ヮ + LETTER_MAPPINGS = + Map.ofEntries( + Map.entry('ァ', 'ア'), + Map.entry('ィ', 'イ'), + Map.entry('ゥ', 'ウ'), + Map.entry('ェ', 'エ'), + Map.entry('ォ', 'オ'), + Map.entry('ヵ', 'カ'), + Map.entry('ㇰ', 'ク'), + Map.entry('ヶ', 'ケ'), + Map.entry('ㇱ', 'シ'), + Map.entry('ㇲ', 'ス'), + Map.entry('ッ', 'ツ'), + Map.entry('ㇳ', 'ト'), + Map.entry('ㇴ', 'ヌ'), + Map.entry('ㇵ', 'ハ'), + Map.entry('ㇶ', 'ヒ'), + Map.entry('ㇷ', 'フ'), + Map.entry('ㇸ', 'ヘ'), + Map.entry('ㇹ', 'ホ'), + Map.entry('ㇺ', 'ム'), + Map.entry('ャ', 'ヤ'), + Map.entry('ュ', 'ユ'), + Map.entry('ョ', 'ヨ'), + Map.entry('ㇻ', 'ラ'), + Map.entry('ㇼ', 'リ'), + Map.entry('ㇽ', 'ル'), + Map.entry('ㇾ', 'レ'), + Map.entry('ㇿ', 'ロ'), + Map.entry('ヮ', 'ワ')); + } + + private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class); + + public JapaneseKatakanaUppercaseFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String term = termAttr.toString(); + if (term.contains("ㇷ゚")) { + term = term.replace("ㇷ゚", "プ"); + termAttr.setEmpty().append(term); + } + char[] termBuffer = termAttr.buffer(); + for (int i = 0; i < termBuffer.length; i++) { + Character c = LETTER_MAPPINGS.get(termBuffer[i]); + if (c != null) { + termBuffer[i] = c; + } + } + return true; + } else { + return false; + } + } +} diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaUppercaseFilterFactory.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaUppercaseFilterFactory.java new file mode 100644 index 00000000000..a6e4e8943ea --- /dev/null +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaUppercaseFilterFactory.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.ja; + +import java.util.Map; +import org.apache.lucene.analysis.TokenFilterFactory; +import org.apache.lucene.analysis.TokenStream; + +/** + * Factory for {@link JapaneseKatakanaUppercaseFilter}. + * + * @lucene.spi {@value #NAME} + */ +public class JapaneseKatakanaUppercaseFilterFactory extends TokenFilterFactory { + + /** SPI name */ + public static final String NAME = "japaneseKatakanaUppercase"; + + public JapaneseKatakanaUppercaseFilterFactory(Map args) { + super(args); + if (!args.isEmpty()) { + throw new IllegalArgumentException("Unknown parameters: " + args); + } + } + + /** Default ctor for compatibility with SPI */ + public JapaneseKatakanaUppercaseFilterFactory() { + throw defaultCtorException(); + } + + @Override + public TokenStream create(TokenStream input) { + return new JapaneseKatakanaUppercaseFilter(input); + } +} diff --git a/lucene/analysis/kuromoji/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory b/lucene/analysis/kuromoji/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory index a5cfe7ba4bb..c3c7cac02d0 100644 --- a/lucene/analysis/kuromoji/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory +++ b/lucene/analysis/kuromoji/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory @@ -19,3 +19,5 @@ org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilterFactory org.apache.lucene.analysis.ja.JapaneseNumberFilterFactory org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilterFactory org.apache.lucene.analysis.ja.JapaneseReadingFormFilterFactory +org.apache.lucene.analysis.ja.JapaneseHiraganaUppercaseFilterFactory +org.apache.lucene.analysis.ja.JapaneseKatakanaUppercaseFilterFactory diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseHiraganaUppercaseFilter.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseHiraganaUppercaseFilter.java new file mode 100644 index 00000000000..7fb0c7d7972 --- /dev/null +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseHiraganaUppercaseFilter.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.ja; + +import java.io.IOException; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.tests.analysis.MockTokenizer; + +/** Tests for {@link JapaneseHiraganaUppercaseFilter} */ +public class TestJapaneseHiraganaUppercaseFilter extends BaseTokenStreamTestCase { + private Analyzer keywordAnalyzer, japaneseAnalyzer; + + @Override + public void setUp() throws Exception { + super.setUp(); + keywordAnalyzer = + new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents( + tokenizer, new JapaneseHiraganaUppercaseFilter(tokenizer)); + } + }; + japaneseAnalyzer = + new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = + new JapaneseTokenizer( + newAttributeFactory(), null, false, JapaneseTokenizer.Mode.SEARCH); + return new TokenStreamComponents( + tokenizer, new JapaneseHiraganaUppercaseFilter(tokenizer)); + } + }; + } + + @Override + public void tearDown() throws Exception { + keywordAnalyzer.close(); + japaneseAnalyzer.close(); + super.tearDown(); + } + + public void testKanaUppercase() throws IOException { + assertAnalyzesTo(keywordAnalyzer, "ぁぃぅぇぉっゃゅょゎゕゖ", new String[] {"あいうえおつやゆよわかけ"}); + assertAnalyzesTo(keywordAnalyzer, "ちょっとまって", new String[] {"ちよつとまつて"}); + } + + public void testKanaUppercaseWithSurrogatePair() throws IOException { + // 𠀋 : \uD840\uDC0B + assertAnalyzesTo( + keywordAnalyzer, + "\uD840\uDC0Bちょっとまって ちょっと\uD840\uDC0Bまって ちょっとまって\uD840\uDC0B", + new String[] {"\uD840\uDC0Bちよつとまつて", "ちよつと\uD840\uDC0Bまつて", "ちよつとまつて\uD840\uDC0B"}); + } + + public void testKanaUppercaseWithJapaneseTokenizer() throws IOException { + assertAnalyzesTo(japaneseAnalyzer, "ちょっとまって", new String[] {"ちよつと", "まつ", "て"}); + } + + public void testRandomData() throws IOException { + checkRandomData(random(), keywordAnalyzer, 200 * RANDOM_MULTIPLIER); + } + + public void testEmptyTerm() throws IOException { + assertAnalyzesTo(keywordAnalyzer, "", new String[] {}); + } +} diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseHiraganaUppercaseFilterFactory.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseHiraganaUppercaseFilterFactory.java new file mode 100644 index 00000000000..203d7eab5e6 --- /dev/null +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseHiraganaUppercaseFilterFactory.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.ja; + +import java.io.IOException; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.tests.util.StringMockResourceLoader; + +/** Tests for {@link JapaneseHiraganaUppercaseFilterFactory} */ +public class TestJapaneseHiraganaUppercaseFilterFactory extends BaseTokenStreamTestCase { + public void testBasics() throws IOException { + + Map args = new HashMap<>(); + args.put("discardPunctuation", "false"); + + JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(args); + + tokenizerFactory.inform(new StringMockResourceLoader("")); + TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory()); + ((Tokenizer) tokenStream).setReader(new StringReader("ちょっとまって")); + + JapaneseHiraganaUppercaseFilterFactory factory = + new JapaneseHiraganaUppercaseFilterFactory(new HashMap<>()); + tokenStream = factory.create(tokenStream); + assertTokenStreamContents(tokenStream, new String[] {"ちよつと", "まつ", "て"}); + } + + /** Test that bogus arguments result in exception */ + public void testBogusArguments() throws Exception { + IllegalArgumentException expected = + expectThrows( + IllegalArgumentException.class, + () -> + new JapaneseHiraganaUppercaseFilterFactory( + new HashMap<>() { + { + put("bogusArg", "bogusValue"); + } + })); + assertTrue(expected.getMessage().contains("Unknown parameters")); + } +} diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaUppercaseFilter.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaUppercaseFilter.java new file mode 100644 index 00000000000..30039305797 --- /dev/null +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaUppercaseFilter.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.ja; + +import java.io.IOException; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.tests.analysis.MockTokenizer; + +/** Tests for {@link JapaneseKatakanaUppercaseFilter} */ +public class TestJapaneseKatakanaUppercaseFilter extends BaseTokenStreamTestCase { + private Analyzer keywordAnalyzer, japaneseAnalyzer; + + @Override + public void setUp() throws Exception { + super.setUp(); + keywordAnalyzer = + new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents( + tokenizer, new JapaneseKatakanaUppercaseFilter(tokenizer)); + } + }; + japaneseAnalyzer = + new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = + new JapaneseTokenizer( + newAttributeFactory(), null, false, JapaneseTokenizer.Mode.SEARCH); + return new TokenStreamComponents( + tokenizer, new JapaneseKatakanaUppercaseFilter(tokenizer)); + } + }; + } + + @Override + public void tearDown() throws Exception { + keywordAnalyzer.close(); + japaneseAnalyzer.close(); + super.tearDown(); + } + + public void testKanaUppercase() throws IOException { + assertAnalyzesTo( + keywordAnalyzer, + "ァィゥェォヵㇰヶㇱㇲッㇳㇴㇵㇶㇷㇷ゚ㇸㇹㇺャュョㇻㇼㇽㇾㇿヮ", + new String[] {"アイウエオカクケシスツトヌハヒフプヘホムヤユヨラリルレロワ"}); + assertAnalyzesTo(keywordAnalyzer, "ストップウォッチ", new String[] {"ストツプウオツチ"}); + assertAnalyzesTo(keywordAnalyzer, "サラニㇷ゚ カムイチェㇷ゚ ㇷ゚ㇷ゚", new String[] {"サラニプ", "カムイチエプ", "ププ"}); + } + + public void testKanaUppercaseWithSurrogatePair() throws IOException { + // 𠀋 : \uD840\uDC0B + assertAnalyzesTo( + keywordAnalyzer, + "\uD840\uDC0Bストップウォッチ ストップ\uD840\uDC0Bウォッチ ストップウォッチ\uD840\uDC0B", + new String[] {"\uD840\uDC0Bストツプウオツチ", "ストツプ\uD840\uDC0Bウオツチ", "ストツプウオツチ\uD840\uDC0B"}); + } + + public void testKanaUppercaseWithJapaneseTokenizer() throws IOException { + assertAnalyzesTo( + japaneseAnalyzer, "時間をストップウォッチで測る", new String[] {"時間", "を", "ストツプウオツチ", "で", "測る"}); + } + + public void testUnsupportedHalfWidthVariants() throws IOException { + // The below result is expected since only full-width katakana is supported + assertAnalyzesTo(keywordAnalyzer, "ストップウォッチ", new String[] {"ストップウォッチ"}); + } + + public void testRandomData() throws IOException { + checkRandomData(random(), keywordAnalyzer, 200 * RANDOM_MULTIPLIER); + } + + public void testEmptyTerm() throws IOException { + assertAnalyzesTo(keywordAnalyzer, "", new String[] {}); + } +} diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaUppercaseFilterFactory.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaUppercaseFilterFactory.java new file mode 100644 index 00000000000..d6cbd1819f4 --- /dev/null +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaUppercaseFilterFactory.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.ja; + +import java.io.IOException; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.tests.util.StringMockResourceLoader; + +/** Tests for {@link JapaneseKatakanaUppercaseFilterFactory} */ +public class TestJapaneseKatakanaUppercaseFilterFactory extends BaseTokenStreamTestCase { + public void testBasics() throws IOException { + + Map args = new HashMap<>(); + args.put("discardPunctuation", "false"); + + JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(args); + + tokenizerFactory.inform(new StringMockResourceLoader("")); + TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory()); + ((Tokenizer) tokenStream).setReader(new StringReader("ストップウォッチ")); + + JapaneseKatakanaUppercaseFilterFactory factory = + new JapaneseKatakanaUppercaseFilterFactory(new HashMap<>()); + tokenStream = factory.create(tokenStream); + assertTokenStreamContents(tokenStream, new String[] {"ストツプウオツチ"}); + } + + /** Test that bogus arguments result in exception */ + public void testBogusArguments() throws Exception { + IllegalArgumentException expected = + expectThrows( + IllegalArgumentException.class, + () -> + new JapaneseKatakanaUppercaseFilterFactory( + new HashMap<>() { + { + put("bogusArg", "bogusValue"); + } + })); + assertTrue(expected.getMessage().contains("Unknown parameters")); + } +}