diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 2d98bc55d77..c2efeede89f 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -194,6 +194,9 @@ New Features
* GITHUB#13125: Recursive graph bisection is now supported on indexes that have blocks, as long as
they configure a parent field via `IndexWriterConfig#setParentField`. (Adrien Grand)
+* GITHUB#12915: Add new token filters for Japanese sutegana (捨て仮名). This introduces JapaneseHiraganaUppercaseFilter
+ and JapaneseKatakanaUppercaseFilter. (Dai Sugimori)
+
Improvements
---------------------
@@ -258,7 +261,6 @@ API Changes
New Features
---------------------
-
* GITHUB#12679: Add support for similarity-based vector searches using [Byte|Float]VectorSimilarityQuery. Uses a new
VectorSimilarityCollector to find all vectors scoring above a `resultSimilarity` while traversing the HNSW graph till
better-scoring nodes are available, or the best candidate is below a score of `traversalSimilarity` in the lowest
diff --git a/lucene/analysis/kuromoji/src/java/module-info.java b/lucene/analysis/kuromoji/src/java/module-info.java
index f2040f628a1..e849ed644bc 100644
--- a/lucene/analysis/kuromoji/src/java/module-info.java
+++ b/lucene/analysis/kuromoji/src/java/module-info.java
@@ -40,5 +40,7 @@ module org.apache.lucene.analysis.kuromoji {
org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilterFactory,
org.apache.lucene.analysis.ja.JapaneseNumberFilterFactory,
org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilterFactory,
- org.apache.lucene.analysis.ja.JapaneseReadingFormFilterFactory;
+ org.apache.lucene.analysis.ja.JapaneseReadingFormFilterFactory,
+ org.apache.lucene.analysis.ja.JapaneseHiraganaUppercaseFilterFactory,
+ org.apache.lucene.analysis.ja.JapaneseKatakanaUppercaseFilterFactory;
}
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseHiraganaUppercaseFilter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseHiraganaUppercaseFilter.java
new file mode 100644
index 00000000000..b5078b73607
--- /dev/null
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseHiraganaUppercaseFilter.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ja;
+
+import java.io.IOException;
+import java.util.Map;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * A {@link TokenFilter} that normalizes small letters (捨て仮名) in hiragana into normal letters. For
+ * instance, "ちょっとまって" will be translated to "ちよつとまつて".
+ *
+ * <p>This filter is useful if you want to search against old style Japanese text such as patents,
+ * legal, contract policies, etc.
+ */
+public final class JapaneseHiraganaUppercaseFilter extends TokenFilter {
+  /** Maps each small hiragana letter to its full-size counterpart. */
+  private static final Map<Character, Character> LETTER_MAPPINGS;
+
+  static {
+    // supported characters are:
+    // ぁ ぃ ぅ ぇ ぉ っ ゃ ゅ ょ ゎ ゕ ゖ
+    LETTER_MAPPINGS =
+        Map.ofEntries(
+            Map.entry('ぁ', 'あ'),
+            Map.entry('ぃ', 'い'),
+            Map.entry('ぅ', 'う'),
+            Map.entry('ぇ', 'え'),
+            Map.entry('ぉ', 'お'),
+            Map.entry('っ', 'つ'),
+            Map.entry('ゃ', 'や'),
+            Map.entry('ゅ', 'ゆ'),
+            Map.entry('ょ', 'よ'),
+            Map.entry('ゎ', 'わ'),
+            Map.entry('ゕ', 'か'),
+            Map.entry('ゖ', 'け'));
+  }
+
+  private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
+
+  /**
+   * Creates a filter that replaces small hiragana letters in the terms of {@code input}.
+   *
+   * @param input the token stream whose terms are normalized
+   */
+  public JapaneseHiraganaUppercaseFilter(TokenStream input) {
+    super(input);
+  }
+
+  /**
+   * Advances to the next token and rewrites every mapped small letter of its term in place.
+   *
+   * @return {@code true} if a token was produced, {@code false} at end of stream
+   * @throws IOException if the wrapped stream fails
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    final char[] termBuffer = termAttr.buffer();
+    // Only the first length() chars are the current term; the buffer itself may be larger,
+    // so bounding the loop by termBuffer.length would rewrite chars outside the term.
+    final int termLength = termAttr.length();
+    for (int i = 0; i < termLength; i++) {
+      final Character replacement = LETTER_MAPPINGS.get(termBuffer[i]);
+      if (replacement != null) {
+        termBuffer[i] = replacement;
+      }
+    }
+    return true;
+  }
+}
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseHiraganaUppercaseFilterFactory.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseHiraganaUppercaseFilterFactory.java
new file mode 100644
index 00000000000..872d6df406f
--- /dev/null
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseHiraganaUppercaseFilterFactory.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ja;
+
+import java.util.Map;
+import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Factory for {@link JapaneseHiraganaUppercaseFilter}.
+ *
+ * @lucene.spi {@value #NAME}
+ */
+public class JapaneseHiraganaUppercaseFilterFactory extends TokenFilterFactory {
+
+  /** SPI name */
+  public static final String NAME = "japaneseHiraganaUppercase";
+
+  /**
+   * Creates a new JapaneseHiraganaUppercaseFilterFactory.
+   *
+   * @param args factory arguments; this factory defines none of its own
+   * @throws IllegalArgumentException if {@code args} still has entries after {@code super(args)}
+   */
+  public JapaneseHiraganaUppercaseFilterFactory(Map<String, String> args) {
+    super(args);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /** Default ctor for compatibility with SPI */
+  public JapaneseHiraganaUppercaseFilterFactory() {
+    throw defaultCtorException();
+  }
+
+  /** Wraps {@code input} in a new {@link JapaneseHiraganaUppercaseFilter}. */
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new JapaneseHiraganaUppercaseFilter(input);
+  }
+}
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaUppercaseFilter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaUppercaseFilter.java
new file mode 100644
index 00000000000..5e05714d1c3
--- /dev/null
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaUppercaseFilter.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ja;
+
+import java.io.IOException;
+import java.util.Map;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * A {@link TokenFilter} that normalizes small letters (捨て仮名) in katakana into normal letters. For
+ * instance, "ストップウォッチ" will be translated to "ストツプウオツチ".
+ *
+ * <p>This filter is useful if you want to search against old style Japanese text such as patents,
+ * legal, contract policies, etc.
+ */
+public final class JapaneseKatakanaUppercaseFilter extends TokenFilter {
+  /** Maps each single-char small katakana letter to its full-size counterpart. */
+  private static final Map<Character, Character> LETTER_MAPPINGS;
+
+  static {
+    // supported characters are:
+    // ァ ィ ゥ ェ ォ ヵ ㇰ ヶ ㇱ ㇲ ッ ㇳ ㇴ ㇵ ㇶ ㇷ ㇷ゚ ㇸ ㇹ ㇺ ャ ュ ョ ㇻ ㇼ ㇽ ㇾ ㇿ ヮ
+    LETTER_MAPPINGS =
+        Map.ofEntries(
+            Map.entry('ァ', 'ア'),
+            Map.entry('ィ', 'イ'),
+            Map.entry('ゥ', 'ウ'),
+            Map.entry('ェ', 'エ'),
+            Map.entry('ォ', 'オ'),
+            Map.entry('ヵ', 'カ'),
+            Map.entry('ㇰ', 'ク'),
+            Map.entry('ヶ', 'ケ'),
+            Map.entry('ㇱ', 'シ'),
+            Map.entry('ㇲ', 'ス'),
+            Map.entry('ッ', 'ツ'),
+            Map.entry('ㇳ', 'ト'),
+            Map.entry('ㇴ', 'ヌ'),
+            Map.entry('ㇵ', 'ハ'),
+            Map.entry('ㇶ', 'ヒ'),
+            Map.entry('ㇷ', 'フ'),
+            Map.entry('ㇸ', 'ヘ'),
+            Map.entry('ㇹ', 'ホ'),
+            Map.entry('ㇺ', 'ム'),
+            Map.entry('ャ', 'ヤ'),
+            Map.entry('ュ', 'ユ'),
+            Map.entry('ョ', 'ヨ'),
+            Map.entry('ㇻ', 'ラ'),
+            Map.entry('ㇼ', 'リ'),
+            Map.entry('ㇽ', 'ル'),
+            Map.entry('ㇾ', 'レ'),
+            Map.entry('ㇿ', 'ロ'),
+            Map.entry('ヮ', 'ワ'));
+  }
+
+  private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
+
+  /**
+   * Creates a filter that replaces small katakana letters in the terms of {@code input}.
+   *
+   * @param input the token stream whose terms are normalized
+   */
+  public JapaneseKatakanaUppercaseFilter(TokenStream input) {
+    super(input);
+  }
+
+  /**
+   * Advances to the next token and rewrites its term in place: small FU (U+31F7) followed by the
+   * combining semi-voiced sound mark (U+309A) becomes プ, and every other mapped small letter
+   * becomes its full-size form.
+   *
+   * @return {@code true} if a token was produced, {@code false} at end of stream
+   * @throws IOException if the wrapped stream fails
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    final char[] termBuffer = termAttr.buffer();
+    // Only the first length() chars are the current term; never touch the rest of the buffer.
+    final int termLength = termAttr.length();
+    int to = 0;
+    for (int from = 0; from < termLength; from++, to++) {
+      final char c = termBuffer[from];
+      if (c == '\u31F7' && from + 1 < termLength && termBuffer[from + 1] == '\u309A') {
+        // Small FU followed by the combining semi-voiced mark: collapse both chars into プ.
+        termBuffer[to] = 'プ';
+        from++;
+      } else {
+        final Character replacement = LETTER_MAPPINGS.get(c);
+        termBuffer[to] = replacement == null ? c : replacement;
+      }
+    }
+    // The term shrinks by one char for every collapsed pair.
+    termAttr.setLength(to);
+    return true;
+  }
+}
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaUppercaseFilterFactory.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaUppercaseFilterFactory.java
new file mode 100644
index 00000000000..a6e4e8943ea
--- /dev/null
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaUppercaseFilterFactory.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ja;
+
+import java.util.Map;
+import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Factory for {@link JapaneseKatakanaUppercaseFilter}.
+ *
+ * @lucene.spi {@value #NAME}
+ */
+public class JapaneseKatakanaUppercaseFilterFactory extends TokenFilterFactory {
+
+  /** SPI name */
+  public static final String NAME = "japaneseKatakanaUppercase";
+
+  /**
+   * Creates a new JapaneseKatakanaUppercaseFilterFactory.
+   *
+   * @param args factory arguments; this factory defines none of its own
+   * @throws IllegalArgumentException if {@code args} still has entries after {@code super(args)}
+   */
+  public JapaneseKatakanaUppercaseFilterFactory(Map<String, String> args) {
+    super(args);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /** Default ctor for compatibility with SPI */
+  public JapaneseKatakanaUppercaseFilterFactory() {
+    throw defaultCtorException();
+  }
+
+  /** Wraps {@code input} in a new {@link JapaneseKatakanaUppercaseFilter}. */
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new JapaneseKatakanaUppercaseFilter(input);
+  }
+}
diff --git a/lucene/analysis/kuromoji/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory b/lucene/analysis/kuromoji/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
index a5cfe7ba4bb..c3c7cac02d0 100644
--- a/lucene/analysis/kuromoji/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
+++ b/lucene/analysis/kuromoji/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
@@ -19,3 +19,5 @@ org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilterFactory
org.apache.lucene.analysis.ja.JapaneseNumberFilterFactory
org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilterFactory
org.apache.lucene.analysis.ja.JapaneseReadingFormFilterFactory
+org.apache.lucene.analysis.ja.JapaneseHiraganaUppercaseFilterFactory
+org.apache.lucene.analysis.ja.JapaneseKatakanaUppercaseFilterFactory
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseHiraganaUppercaseFilter.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseHiraganaUppercaseFilter.java
new file mode 100644
index 00000000000..7fb0c7d7972
--- /dev/null
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseHiraganaUppercaseFilter.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ja;
+
+import java.io.IOException;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.tests.analysis.MockTokenizer;
+
+/** Tests for {@link JapaneseHiraganaUppercaseFilter} */
+public class TestJapaneseHiraganaUppercaseFilter extends BaseTokenStreamTestCase {
+  /** Whitespace-split tokens fed through the filter under test. */
+  private Analyzer keywordAnalyzer;
+
+  /** Search-mode {@link JapaneseTokenizer} tokens fed through the filter under test. */
+  private Analyzer japaneseAnalyzer;
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    keywordAnalyzer =
+        new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            final Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+            final JapaneseHiraganaUppercaseFilter filtered =
+                new JapaneseHiraganaUppercaseFilter(source);
+            return new TokenStreamComponents(source, filtered);
+          }
+        };
+    japaneseAnalyzer =
+        new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            final Tokenizer source =
+                new JapaneseTokenizer(
+                    newAttributeFactory(), null, false, JapaneseTokenizer.Mode.SEARCH);
+            final JapaneseHiraganaUppercaseFilter filtered =
+                new JapaneseHiraganaUppercaseFilter(source);
+            return new TokenStreamComponents(source, filtered);
+          }
+        };
+  }
+
+  @Override
+  public void tearDown() throws Exception {
+    keywordAnalyzer.close();
+    japaneseAnalyzer.close();
+    super.tearDown();
+  }
+
+  /** Every supported small letter, and a typical phrase, are mapped to full-size letters. */
+  public void testKanaUppercase() throws IOException {
+    final String[] allLetters = {"あいうえおつやゆよわかけ"};
+    assertAnalyzesTo(keywordAnalyzer, "ぁぃぅぇぉっゃゅょゎゕゖ", allLetters);
+
+    final String[] phrase = {"ちよつとまつて"};
+    assertAnalyzesTo(keywordAnalyzer, "ちょっとまって", phrase);
+  }
+
+  /** Supplementary characters at the start, middle and end of a term are left intact. */
+  public void testKanaUppercaseWithSurrogatePair() throws IOException {
+    // 𠀋 : \uD840\uDC0B
+    final String input =
+        "\uD840\uDC0Bちょっとまって ちょっと\uD840\uDC0Bまって ちょっとまって\uD840\uDC0B";
+    final String[] expected = {
+      "\uD840\uDC0Bちよつとまつて", "ちよつと\uD840\uDC0Bまつて", "ちよつとまつて\uD840\uDC0B"
+    };
+    assertAnalyzesTo(keywordAnalyzer, input, expected);
+  }
+
+  /** The filter is applied to each token produced by {@link JapaneseTokenizer}. */
+  public void testKanaUppercaseWithJapaneseTokenizer() throws IOException {
+    final String[] expected = {"ちよつと", "まつ", "て"};
+    assertAnalyzesTo(japaneseAnalyzer, "ちょっとまって", expected);
+  }
+
+  /** Runs the shared random-data checks against the keyword analyzer. */
+  public void testRandomData() throws IOException {
+    checkRandomData(random(), keywordAnalyzer, 200 * RANDOM_MULTIPLIER);
+  }
+
+  /** Empty input yields no tokens. */
+  public void testEmptyTerm() throws IOException {
+    final String[] noTokens = {};
+    assertAnalyzesTo(keywordAnalyzer, "", noTokens);
+  }
+}
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseHiraganaUppercaseFilterFactory.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseHiraganaUppercaseFilterFactory.java
new file mode 100644
index 00000000000..203d7eab5e6
--- /dev/null
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseHiraganaUppercaseFilterFactory.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ja;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
+
+/** Tests for {@link JapaneseHiraganaUppercaseFilterFactory} */
+public class TestJapaneseHiraganaUppercaseFilterFactory extends BaseTokenStreamTestCase {
+  /** A factory-created filter normalizes the tokens of a factory-created tokenizer. */
+  public void testBasics() throws IOException {
+    Map<String, String> tokenizerArgs = new HashMap<>();
+    tokenizerArgs.put("discardPunctuation", "false");
+    JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(tokenizerArgs);
+    tokenizerFactory.inform(new StringMockResourceLoader(""));
+
+    TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory());
+    ((Tokenizer) tokenStream).setReader(new StringReader("ちょっとまって"));
+
+    JapaneseHiraganaUppercaseFilterFactory factory =
+        new JapaneseHiraganaUppercaseFilterFactory(new HashMap<>());
+    tokenStream = factory.create(tokenStream);
+    assertTokenStreamContents(tokenStream, new String[] {"ちよつと", "まつ", "て"});
+  }
+
+  /** Test that bogus arguments result in exception */
+  public void testBogusArguments() throws Exception {
+    Map<String, String> bogusArgs = new HashMap<>();
+    bogusArgs.put("bogusArg", "bogusValue");
+    IllegalArgumentException expected =
+        expectThrows(
+            IllegalArgumentException.class,
+            () -> new JapaneseHiraganaUppercaseFilterFactory(bogusArgs));
+    assertTrue(expected.getMessage().contains("Unknown parameters"));
+  }
+}
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaUppercaseFilter.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaUppercaseFilter.java
new file mode 100644
index 00000000000..30039305797
--- /dev/null
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaUppercaseFilter.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ja;
+
+import java.io.IOException;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.tests.analysis.MockTokenizer;
+
+/** Tests for {@link JapaneseKatakanaUppercaseFilter} */
+public class TestJapaneseKatakanaUppercaseFilter extends BaseTokenStreamTestCase {
+  private Analyzer keywordAnalyzer, japaneseAnalyzer;
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    keywordAnalyzer =
+        new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+            return new TokenStreamComponents(
+                tokenizer, new JapaneseKatakanaUppercaseFilter(tokenizer));
+          }
+        };
+    japaneseAnalyzer =
+        new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            Tokenizer tokenizer =
+                new JapaneseTokenizer(
+                    newAttributeFactory(), null, false, JapaneseTokenizer.Mode.SEARCH);
+            return new TokenStreamComponents(
+                tokenizer, new JapaneseKatakanaUppercaseFilter(tokenizer));
+          }
+        };
+  }
+
+  @Override
+  public void tearDown() throws Exception {
+    keywordAnalyzer.close();
+    japaneseAnalyzer.close();
+    super.tearDown();
+  }
+
+  /** Every supported small letter, including the two-char small PU, becomes full-size. */
+  public void testKanaUppercase() throws IOException {
+    assertAnalyzesTo(
+        keywordAnalyzer,
+        "ァィゥェォヵㇰヶㇱㇲッㇳㇴㇵㇶㇷㇷ゚ㇸㇹㇺャュョㇻㇼㇽㇾㇿヮ",
+        new String[] {"アイウエオカクケシスツトヌハヒフプヘホムヤユヨラリルレロワ"});
+    assertAnalyzesTo(keywordAnalyzer, "ストップウォッチ", new String[] {"ストツプウオツチ"});
+    assertAnalyzesTo(keywordAnalyzer, "サラニㇷ゚ カムイチェㇷ゚ ㇷ゚ㇷ゚", new String[] {"サラニプ", "カムイチエプ", "ププ"});
+  }
+
+  /** Supplementary characters at the start, middle and end of a term are left intact. */
+  public void testKanaUppercaseWithSurrogatePair() throws IOException {
+    // 𠀋 : \uD840\uDC0B
+    assertAnalyzesTo(
+        keywordAnalyzer,
+        "\uD840\uDC0Bストップウォッチ ストップ\uD840\uDC0Bウォッチ ストップウォッチ\uD840\uDC0B",
+        new String[] {"\uD840\uDC0Bストツプウオツチ", "ストツプ\uD840\uDC0Bウオツチ", "ストツプウオツチ\uD840\uDC0B"});
+  }
+
+  /** The filter is applied to each token produced by {@link JapaneseTokenizer}. */
+  public void testKanaUppercaseWithJapaneseTokenizer() throws IOException {
+    assertAnalyzesTo(
+        japaneseAnalyzer, "時間をストップウォッチで測る", new String[] {"時間", "を", "ストツプウオツチ", "で", "測る"});
+  }
+
+  /** Half-width katakana is outside the mapping table and must pass through unchanged. */
+  public void testUnsupportedHalfWidthVariants() throws IOException {
+    // The below result is expected since only full-width katakana is supported.
+    // The half-width spelling of "stopwatch" is written with escapes so that the literal cannot
+    // be silently turned into full-width katakana by Unicode compatibility normalization.
+    String halfWidth = "\uFF7D\uFF84\uFF6F\uFF8C\uFF9F\uFF73\uFF6B\uFF6F\uFF81";
+    assertAnalyzesTo(keywordAnalyzer, halfWidth, new String[] {halfWidth});
+  }
+
+  /** Runs the shared random-data checks against the keyword analyzer. */
+  public void testRandomData() throws IOException {
+    checkRandomData(random(), keywordAnalyzer, 200 * RANDOM_MULTIPLIER);
+  }
+
+  /** Empty input yields no tokens. */
+  public void testEmptyTerm() throws IOException {
+    assertAnalyzesTo(keywordAnalyzer, "", new String[] {});
+  }
+}
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaUppercaseFilterFactory.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaUppercaseFilterFactory.java
new file mode 100644
index 00000000000..d6cbd1819f4
--- /dev/null
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaUppercaseFilterFactory.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ja;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.tests.util.StringMockResourceLoader;
+
+/** Tests for {@link JapaneseKatakanaUppercaseFilterFactory} */
+public class TestJapaneseKatakanaUppercaseFilterFactory extends BaseTokenStreamTestCase {
+  /** A factory-created filter normalizes the tokens of a factory-created tokenizer. */
+  public void testBasics() throws IOException {
+    Map<String, String> tokenizerArgs = new HashMap<>();
+    tokenizerArgs.put("discardPunctuation", "false");
+    JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(tokenizerArgs);
+    tokenizerFactory.inform(new StringMockResourceLoader(""));
+
+    TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory());
+    ((Tokenizer) tokenStream).setReader(new StringReader("ストップウォッチ"));
+
+    JapaneseKatakanaUppercaseFilterFactory factory =
+        new JapaneseKatakanaUppercaseFilterFactory(new HashMap<>());
+    tokenStream = factory.create(tokenStream);
+    assertTokenStreamContents(tokenStream, new String[] {"ストツプウオツチ"});
+  }
+
+  /** Test that bogus arguments result in exception */
+  public void testBogusArguments() throws Exception {
+    Map<String, String> bogusArgs = new HashMap<>();
+    bogusArgs.put("bogusArg", "bogusValue");
+    IllegalArgumentException expected =
+        expectThrows(
+            IllegalArgumentException.class,
+            () -> new JapaneseKatakanaUppercaseFilterFactory(bogusArgs));
+    assertTrue(expected.getMessage().contains("Unknown parameters"));
+  }
+}