mirror of
https://github.com/apache/lucene.git
synced 2025-03-07 00:39:21 +00:00
Fix for the bug where JapaneseReadingFormFilter cannot convert some hiragana to romaji (#12885)
This commit is contained in:
parent
2a851401a1
commit
8bee41880e
@ -222,6 +222,8 @@ Bug Fixes
|
||||
* GITHUB#12920: Address bug in TestDrillSideways#testCollectionTerminated that could occasionally cause the test to
|
||||
fail with certain random seeds. (Greg Miller)
|
||||
|
||||
* GITHUB#12885: Fixed a bug where JapaneseReadingFormFilter could not convert some hiragana to romaji. (Takuma Kuramitsu)
|
||||
|
||||
Build
|
||||
---------------------
|
||||
|
||||
|
@ -43,10 +43,39 @@ public final class JapaneseReadingFormFilter extends TokenFilter {
|
||||
this(input, false);
|
||||
}
|
||||
|
||||
// Lower bound (inclusive) of the char range that isHiragana() accepts: U+3041.
private static final char HIRAGANA_START = 0x3041;
|
||||
// Upper bound (inclusive) of the char range that isHiragana() accepts: U+3096.
private static final char HIRAGANA_END = 0x3096;
|
||||
|
||||
private boolean isHiragana(char ch) {
|
||||
return ch >= HIRAGANA_START && ch <= HIRAGANA_END;
|
||||
}
|
||||
|
||||
private boolean containsHiragana(CharSequence s) {
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
if (isHiragana(s.charAt(i))) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
String reading = readingAttr.getReading();
|
||||
if (reading == null && containsHiragana(termAttr)) {
|
||||
// When a term is OOV and contains hiragana, convert the term to katakana and treat it as
|
||||
// reading.
|
||||
int len = termAttr.length();
|
||||
char[] readingBuffer = new char[len];
|
||||
for (int i = 0; i < len; i++) {
|
||||
readingBuffer[i] = termAttr.charAt(i);
|
||||
if (isHiragana(readingBuffer[i])) {
|
||||
readingBuffer[i] += 0x60;
|
||||
}
|
||||
}
|
||||
reading = new String(readingBuffer);
|
||||
}
|
||||
|
||||
if (useRomaji) {
|
||||
if (reading == null) {
|
||||
|
@ -88,6 +88,11 @@ public class TestJapaneseReadingFormFilter extends BaseTokenStreamTestCase {
|
||||
a.close();
|
||||
}
|
||||
|
||||
public void testKatakanaReadingsHiragana() throws IOException {
|
||||
assertAnalyzesTo(
|
||||
katakanaAnalyzer, "が ぎ ぐ げ ご ぁ ゔ", new String[] {"ガ", "ギ", "グ", "ゲ", "ゴ", "ァ", "ヴ"});
|
||||
}
|
||||
|
||||
public void testRomajiReadings() throws IOException {
|
||||
assertAnalyzesTo(
|
||||
romajiAnalyzer,
|
||||
@ -115,6 +120,11 @@ public class TestJapaneseReadingFormFilter extends BaseTokenStreamTestCase {
|
||||
a.close();
|
||||
}
|
||||
|
||||
public void testRomajiReadingsHiragana() throws IOException {
|
||||
assertAnalyzesTo(
|
||||
romajiAnalyzer, "が ぎ ぐ げ ご ぁ ゔ", new String[] {"ga", "gi", "gu", "ge", "go", "a", "v"});
|
||||
}
|
||||
|
||||
public void testRandomData() throws IOException {
|
||||
Random random = random();
|
||||
checkRandomData(random, katakanaAnalyzer, 200 * RANDOM_MULTIPLIER);
|
||||
|
Loading…
x
Reference in New Issue
Block a user