Fix for the bug where JapaneseReadingFormFilter cannot convert some hiragana to romaji (#12885)

This commit is contained in:
kuramitsu 2024-01-12 04:33:16 +09:00 committed by GitHub
parent 2a851401a1
commit 8bee41880e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 41 additions and 0 deletions

View File

@ -222,6 +222,8 @@ Bug Fixes
* GITHUB#12920: Address bug in TestDrillSideways#testCollectionTerminated that could occasionally cause the test to
fail with certain random seeds. (Greg Miller)
* GITHUB#12885: Fixed a bug where JapaneseReadingFormFilter could not convert some hiragana to romaji. (Takuma Kuramitsu)
Build
---------------------

View File

@ -43,10 +43,39 @@ public final class JapaneseReadingFormFilter extends TokenFilter {
this(input, false);
}
// Inclusive bounds of the character range that isHiragana() accepts:
// U+3041 (HIRAGANA LETTER SMALL A) through U+3096 (HIRAGANA LETTER SMALL KE).
// incrementToken() adds 0x60 to characters in this range to obtain the katakana
// counterpart, i.e. the range maps onto U+30A1..U+30F6.
private static final char HIRAGANA_START = 0x3041;
private static final char HIRAGANA_END = 0x3096;
/**
 * Tells whether {@code ch} falls inside the inclusive range bounded by {@code HIRAGANA_START}
 * and {@code HIRAGANA_END}.
 *
 * @param ch the character to classify
 * @return {@code true} if {@code ch} is within the hiragana range, {@code false} otherwise
 */
private boolean isHiragana(char ch) {
  // Reject anything below the range first; what remains only needs the upper-bound check.
  if (ch < HIRAGANA_START) {
    return false;
  }
  return ch <= HIRAGANA_END;
}
/**
 * Scans {@code s} from left to right and reports whether at least one of its characters is
 * accepted by {@link #isHiragana(char)}.
 *
 * @param s the character sequence to inspect
 * @return {@code true} as soon as a hiragana character is found; {@code false} if the sequence
 *     is empty or contains none
 */
private boolean containsHiragana(CharSequence s) {
  final int length = s.length();
  int pos = 0;
  // Advance past every non-hiragana character; stop at the first hit or at the end.
  while (pos < length && !isHiragana(s.charAt(pos))) {
    pos++;
  }
  // Stopping before the end means a hiragana character was encountered.
  return pos < length;
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
String reading = readingAttr.getReading();
if (reading == null && containsHiragana(termAttr)) {
// When a term is OOV and contains hiragana, convert the term to katakana and treat it as
// reading.
int len = termAttr.length();
char[] readingBuffer = new char[len];
for (int i = 0; i < len; i++) {
readingBuffer[i] = termAttr.charAt(i);
if (isHiragana(readingBuffer[i])) {
readingBuffer[i] += 0x60;
}
}
reading = new String(readingBuffer);
}
if (useRomaji) {
if (reading == null) {

View File

@ -88,6 +88,11 @@ public class TestJapaneseReadingFormFilter extends BaseTokenStreamTestCase {
a.close();
}
/**
 * Checks that single-character hiragana tokens are emitted as their katakana counterparts when
 * the filter runs in katakana (non-romaji) mode.
 *
 * <p>Each expected value is the input character shifted by 0x60, which is the hiragana-to-katakana
 * offset the filter applies to terms that have no reading.
 */
public void testKatakanaReadingsHiragana() throws IOException {
  final String hiraganaInput = "が ぎ ぐ げ ご ぁ ゔ";
  // Expected katakana, one per whitespace-separated input token, in the same order.
  final String[] expectedKatakana = {"ガ", "ギ", "グ", "ゲ", "ゴ", "ァ", "ヴ"};
  assertAnalyzesTo(katakanaAnalyzer, hiraganaInput, expectedKatakana);
}
public void testRomajiReadings() throws IOException {
assertAnalyzesTo(
romajiAnalyzer,
@ -115,6 +120,11 @@ public class TestJapaneseReadingFormFilter extends BaseTokenStreamTestCase {
a.close();
}
/**
 * Checks that single-character hiragana tokens are romanized when the filter runs in romaji
 * mode.
 */
public void testRomajiReadingsHiragana() throws IOException {
  final String hiraganaInput = "が ぎ ぐ げ ご ぁ ゔ";
  // Expected romaji, one per whitespace-separated input token, in the same order.
  final String[] expectedRomaji = new String[] {"ga", "gi", "gu", "ge", "go", "a", "v"};
  assertAnalyzesTo(romajiAnalyzer, hiraganaInput, expectedRomaji);
}
public void testRandomData() throws IOException {
Random random = random();
checkRandomData(random, katakanaAnalyzer, 200 * RANDOM_MULTIPLIER);