mirror of https://github.com/apache/lucene.git
LUCENE-8548: The KoreanTokenizer no longer splits unknown words on combining diacritics and
detects script boundaries more accurately with Character.UnicodeScript#of. Signed-off-by: Jim Ferenczi <jimczi@apache.org>
This commit is contained in:
parent
b6b9f9554e
commit
643ffc6f9f
|
@ -214,6 +214,12 @@ Build
|
|||
|
||||
* LUCENE-8537: ant test command fails under lucene/tools (Peter Somogyi)
|
||||
|
||||
Bug fixes:
|
||||
|
||||
* LUCENE-8548: The KoreanTokenizer no longer splits unknown words on combining diacritics and
|
||||
detects script boundaries more accurately with Character.UnicodeScript#of.
|
||||
(Christophe Bismuth, Jim Ferenczi)
|
||||
|
||||
New Features
|
||||
|
||||
* LUCENE-8026: ExitableDirectoryReader may now time out queries that run on
|
||||
|
|
|
@ -43,6 +43,8 @@ import org.apache.lucene.util.IntsRef;
|
|||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
|
||||
import static java.lang.Character.UnicodeScript;
|
||||
|
||||
/**
|
||||
* Tokenizer for Korean that uses morphological analysis.
|
||||
* <p>
|
||||
|
@ -718,27 +720,42 @@ public final class KoreanTokenizer extends Tokenizer {
|
|||
if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) {
|
||||
|
||||
// Find unknown match:
|
||||
final int characterId = characterDefinition.getCharacterClass(firstCharacter);
|
||||
final boolean isPunct = isPunctuation(firstCharacter);
|
||||
|
||||
int characterId = characterDefinition.getCharacterClass(firstCharacter);
|
||||
// NOTE: copied from UnknownDictionary.lookup:
|
||||
int unknownWordLength;
|
||||
if (!characterDefinition.isGroup(firstCharacter)) {
|
||||
unknownWordLength = 1;
|
||||
} else {
|
||||
// Extract unknown word. Characters with the same character class are considered to be part of unknown word
|
||||
// Extract unknown word. Characters with the same script are considered to be part of unknown word
|
||||
unknownWordLength = 1;
|
||||
UnicodeScript scriptCode = UnicodeScript.of((int) firstCharacter);
|
||||
final boolean isPunct = isPunctuation(firstCharacter);
|
||||
for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) {
|
||||
final int ch = buffer.get(posAhead);
|
||||
if (ch == -1) {
|
||||
int next = buffer.get(posAhead);
|
||||
if (next == -1) {
|
||||
break;
|
||||
}
|
||||
if (characterId == characterDefinition.getCharacterClass((char) ch) &&
|
||||
isPunctuation((char) ch) == isPunct) {
|
||||
char ch = (char) next;
|
||||
int chType = Character.getType(ch);
|
||||
UnicodeScript sc = UnicodeScript.of(next);
|
||||
boolean sameScript = isSameScript(scriptCode, sc)
|
||||
// Non-spacing marks inherit the script of their base character,
|
||||
// following recommendations from UTR #24.
|
||||
|| chType == Character.NON_SPACING_MARK;
|
||||
|
||||
if (sameScript
|
||||
&& isPunctuation(ch, chType) == isPunct
|
||||
&& characterDefinition.isGroup(ch)) {
|
||||
unknownWordLength++;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
// Update the script code and character class if the original script
|
||||
// is Inherited or Common.
|
||||
if (isCommonOrInherited(scriptCode) && isCommonOrInherited(sc) == false) {
|
||||
scriptCode = sc;
|
||||
characterId = characterDefinition.getCharacterClass(ch);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -932,11 +949,15 @@ public final class KoreanTokenizer extends Tokenizer {
|
|||
}
|
||||
|
||||
private static boolean isPunctuation(char ch) {
|
||||
return isPunctuation(ch, Character.getType(ch));
|
||||
}
|
||||
|
||||
private static boolean isPunctuation(char ch, int cid) {
|
||||
// special case for Hangul Letter Araea (interpunct)
|
||||
if (ch == 0x318D) {
|
||||
return true;
|
||||
}
|
||||
switch(Character.getType(ch)) {
|
||||
switch(cid) {
|
||||
case Character.SPACE_SEPARATOR:
|
||||
case Character.LINE_SEPARATOR:
|
||||
case Character.PARAGRAPH_SEPARATOR:
|
||||
|
@ -958,4 +979,16 @@ public final class KoreanTokenizer extends Tokenizer {
|
|||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean isCommonOrInherited(UnicodeScript script) {
|
||||
return script == UnicodeScript.INHERITED ||
|
||||
script == UnicodeScript.COMMON;
|
||||
}
|
||||
|
||||
/** Determine if two scripts are compatible. */
|
||||
private static boolean isSameScript(UnicodeScript scriptOne, UnicodeScript scriptTwo) {
|
||||
return scriptOne == scriptTwo
|
||||
|| isCommonOrInherited(scriptOne)
|
||||
|| isCommonOrInherited(scriptTwo);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -106,4 +106,4 @@ public class TestKoreanAnalyzer extends BaseTokenStreamTestCase {
|
|||
new int[]{1, 1, 1}
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -328,6 +328,44 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
|
|||
analyzer.close();
|
||||
}
|
||||
|
||||
public void testCombining() throws IOException {
|
||||
assertAnalyzesTo(analyzer, "Ба̀лтичко мо̑ре",
|
||||
new String[]{"Ба̀лтичко", "мо̑ре"},
|
||||
new int[]{0, 10},
|
||||
new int[]{9, 15},
|
||||
new int[]{1, 1}
|
||||
);
|
||||
assertPartsOfSpeech(analyzer, "Ба̀лтичко мо̑ре",
|
||||
new POS.Type[]{POS.Type.MORPHEME, POS.Type.MORPHEME},
|
||||
new POS.Tag[]{POS.Tag.SL, POS.Tag.SL},
|
||||
new POS.Tag[]{POS.Tag.SL, POS.Tag.SL}
|
||||
);
|
||||
|
||||
assertAnalyzesTo(analyzer, "ka̠k̚t͡ɕ͈a̠k̚",
|
||||
new String[]{"ka̠k̚t͡ɕ͈a̠k̚"},
|
||||
new int[]{0},
|
||||
new int[]{13},
|
||||
new int[]{1}
|
||||
);
|
||||
assertPartsOfSpeech(analyzer, "ka̠k̚t͡ɕ͈a̠k̚",
|
||||
new POS.Type[]{POS.Type.MORPHEME},
|
||||
new POS.Tag[]{POS.Tag.SL},
|
||||
new POS.Tag[]{POS.Tag.SL}
|
||||
);
|
||||
|
||||
assertAnalyzesTo(analyzer, "εἰμί",
|
||||
new String[]{"εἰμί"},
|
||||
new int[]{0},
|
||||
new int[]{4},
|
||||
new int[]{1}
|
||||
);
|
||||
assertPartsOfSpeech(analyzer, "εἰμί",
|
||||
new POS.Type[]{POS.Type.MORPHEME},
|
||||
new POS.Tag[]{POS.Tag.SL},
|
||||
new POS.Tag[]{POS.Tag.SL}
|
||||
);
|
||||
}
|
||||
|
||||
private void assertReadings(Analyzer analyzer, String input, String... readings) throws IOException {
|
||||
try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
|
||||
ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
|
||||
|
|
Loading…
Reference in New Issue