From 8bee41880e41024109bf5729584ebc5dd1003717 Mon Sep 17 00:00:00 2001
From: kuramitsu
Date: Fri, 12 Jan 2024 04:33:16 +0900
Subject: [PATCH] Fix for the bug where JapaneseReadingFormFilter cannot convert some hiragana to romaji (#12885)

---
 lucene/CHANGES.txt                            |  2 ++
 .../ja/JapaneseReadingFormFilter.java         | 29 +++++++++++++++++++
 .../ja/TestJapaneseReadingFormFilter.java     | 10 +++++++
 3 files changed, 41 insertions(+)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 97f61ee850a..46c1a42679f 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -222,6 +222,8 @@ Bug Fixes
 * GITHUB#12920: Address bug in TestDrillSideways#testCollectionTerminated that could occasionally cause the test to
   fail with certain random seeds. (Greg Miller)
 
+* GITHUB#12885: Fixed the bug that JapaneseReadingFormFilter cannot convert some hiragana to romaji. (Takuma Kuramitsu)
+
 Build
 ---------------------
 
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java
index 046eeb30a31..4b86ec37dc1 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java
@@ -43,10 +43,39 @@ public final class JapaneseReadingFormFilter extends TokenFilter {
     this(input, false);
   }
 
+  private static final char HIRAGANA_START = 0x3041;
+  private static final char HIRAGANA_END = 0x3096;
+
+  private boolean isHiragana(char ch) {
+    return ch >= HIRAGANA_START && ch <= HIRAGANA_END;
+  }
+
+  private boolean containsHiragana(CharSequence s) {
+    for (int i = 0; i < s.length(); i++) {
+      if (isHiragana(s.charAt(i))) {
+        return true;
+      }
+    }
+    return false;
+  }
+
   @Override
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
       String reading = readingAttr.getReading();
+      if (reading == null && containsHiragana(termAttr)) {
+        // When a term is OOV and contains hiragana, convert the term to katakana and treat it as
+        // reading.
+        int len = termAttr.length();
+        char[] readingBuffer = new char[len];
+        for (int i = 0; i < len; i++) {
+          readingBuffer[i] = termAttr.charAt(i);
+          if (isHiragana(readingBuffer[i])) {
+            readingBuffer[i] += 0x60;
+          }
+        }
+        reading = new String(readingBuffer);
+      }
 
       if (useRomaji) {
         if (reading == null) {
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java
index 70996b53eee..037f4b7edea 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java
@@ -88,6 +88,11 @@ public class TestJapaneseReadingFormFilter extends BaseTokenStreamTestCase {
     a.close();
   }
 
+  public void testKatakanaReadingsHiragana() throws IOException {
+    assertAnalyzesTo(
+        katakanaAnalyzer, "が ぎ ぐ げ ご ぁ ゔ", new String[] {"ガ", "ギ", "グ", "ゲ", "ゴ", "ァ", "ヴ"});
+  }
+
   public void testRomajiReadings() throws IOException {
     assertAnalyzesTo(
         romajiAnalyzer,
@@ -115,6 +120,11 @@ public class TestJapaneseReadingFormFilter extends BaseTokenStreamTestCase {
     a.close();
   }
 
+  public void testRomajiReadingsHiragana() throws IOException {
+    assertAnalyzesTo(
+        romajiAnalyzer, "が ぎ ぐ げ ご ぁ ゔ", new String[] {"ga", "gi", "gu", "ge", "go", "a", "v"});
+  }
+
   public void testRandomData() throws IOException {
     Random random = random();
     checkRandomData(random, katakanaAnalyzer, 200 * RANDOM_MULTIPLIER);