LUCENE-8548: The KoreanTokenizer no longer splits unknown words on combining diacritics and

detects script boundaries more accurately with Character.UnicodeScript#of.

Signed-off-by: Jim Ferenczi <jimczi@apache.org>
This commit is contained in:
Christophe Bismuth 2018-11-21 13:58:44 +01:00 committed by Jim Ferenczi
parent b6b9f9554e
commit 643ffc6f9f
4 changed files with 87 additions and 10 deletions

View File

@ -214,6 +214,12 @@ Build
* LUCENE-8537: ant test command fails under lucene/tools (Peter Somogyi)
Bug fixes:
* LUCENE-8548: The KoreanTokenizer no longer splits unknown words on combining diacritics and
detects script boundaries more accurately with Character.UnicodeScript#of.
(Christophe Bismuth, Jim Ferenczi)
New Features
* LUCENE-8026: ExitableDirectoryReader may now time out queries that run on

View File

@ -43,6 +43,8 @@ import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST;
import static java.lang.Character.UnicodeScript;
/**
* Tokenizer for Korean that uses morphological analysis.
* <p>
@ -718,27 +720,42 @@ public final class KoreanTokenizer extends Tokenizer {
if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) {
// Find unknown match:
final int characterId = characterDefinition.getCharacterClass(firstCharacter);
final boolean isPunct = isPunctuation(firstCharacter);
int characterId = characterDefinition.getCharacterClass(firstCharacter);
// NOTE: copied from UnknownDictionary.lookup:
int unknownWordLength;
if (!characterDefinition.isGroup(firstCharacter)) {
unknownWordLength = 1;
} else {
// Extract unknown word. Characters with the same character class are considered to be part of unknown word
// Extract unknown word. Characters with the same script are considered to be part of unknown word
unknownWordLength = 1;
UnicodeScript scriptCode = UnicodeScript.of((int) firstCharacter);
final boolean isPunct = isPunctuation(firstCharacter);
for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) {
final int ch = buffer.get(posAhead);
if (ch == -1) {
int next = buffer.get(posAhead);
if (next == -1) {
break;
}
if (characterId == characterDefinition.getCharacterClass((char) ch) &&
isPunctuation((char) ch) == isPunct) {
char ch = (char) next;
int chType = Character.getType(ch);
UnicodeScript sc = UnicodeScript.of(next);
boolean sameScript = isSameScript(scriptCode, sc)
// Non-spacing marks inherit the script of their base character,
// following recommendations from UTR #24.
|| chType == Character.NON_SPACING_MARK;
if (sameScript
&& isPunctuation(ch, chType) == isPunct
&& characterDefinition.isGroup(ch)) {
unknownWordLength++;
} else {
break;
}
// Update the script code and character class if the original script
// is Inherited or Common.
if (isCommonOrInherited(scriptCode) && isCommonOrInherited(sc) == false) {
scriptCode = sc;
characterId = characterDefinition.getCharacterClass(ch);
}
}
}
@ -932,11 +949,15 @@ public final class KoreanTokenizer extends Tokenizer {
}
/** Convenience overload: classifies {@code ch} as punctuation using its Unicode general category. */
private static boolean isPunctuation(char ch) {
// Delegates to the two-argument form so callers that already hold the
// Character.getType result (e.g. the unknown-word loop) avoid recomputing it.
return isPunctuation(ch, Character.getType(ch));
}
private static boolean isPunctuation(char ch, int cid) {
// special case for Hangul Letter Araea (interpunct)
if (ch == 0x318D) {
return true;
}
switch(Character.getType(ch)) {
switch(cid) {
case Character.SPACE_SEPARATOR:
case Character.LINE_SEPARATOR:
case Character.PARAGRAPH_SEPARATOR:
@ -958,4 +979,16 @@ public final class KoreanTokenizer extends Tokenizer {
return false;
}
}
/**
 * Returns whether {@code script} is one of the two special Unicode script values
 * (COMMON, INHERITED) that do not identify a concrete writing system and may
 * therefore appear inside a run of any real script.
 */
private static boolean isCommonOrInherited(UnicodeScript script) {
switch (script) {
case COMMON:
case INHERITED:
return true;
default:
return false;
}
}
/**
 * Determine if two scripts are compatible, i.e. may belong to the same
 * unknown-word run. Identical scripts always match; COMMON and INHERITED
 * match anything, per UTR #24 recommendations.
 */
private static boolean isSameScript(UnicodeScript scriptOne, UnicodeScript scriptTwo) {
if (scriptOne == scriptTwo) {
return true;
}
// A script-less (Common/Inherited) character on either side never breaks a run.
return isCommonOrInherited(scriptOne) || isCommonOrInherited(scriptTwo);
}
}

View File

@ -106,4 +106,4 @@ public class TestKoreanAnalyzer extends BaseTokenStreamTestCase {
new int[]{1, 1, 1}
);
}
}
}

View File

@ -328,6 +328,44 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
analyzer.close();
}
/**
 * Verifies LUCENE-8548: unknown-word tokenization must not split on combining
 * diacritical marks, and script boundaries (not just character class) delimit tokens.
 * NOTE(review): the fixture strings below contain combining code points, so the
 * expected offsets count UTF-16 units, not visible glyphs — keep them byte-exact.
 */
public void testCombining() throws IOException {
// Cyrillic words carrying combining accents (e.g. combining grave/inverted breve):
// each word must stay a single token despite the NON_SPACING_MARK characters inside.
assertAnalyzesTo(analyzer, "Ба̀лтичко мо̑ре",
new String[]{"Ба̀лтичко", "мо̑ре"},
new int[]{0, 10},
new int[]{9, 15},
new int[]{1, 1}
);
// Both tokens are tagged as foreign-language (SL) morphemes.
assertPartsOfSpeech(analyzer, "Ба̀лтичко мо̑ре",
new POS.Type[]{POS.Type.MORPHEME, POS.Type.MORPHEME},
new POS.Tag[]{POS.Tag.SL, POS.Tag.SL},
new POS.Tag[]{POS.Tag.SL, POS.Tag.SL}
);
// IPA transcription mixing Latin letters with several combining/modifier marks
// (tie bar, minus sign below, etc.): must remain one token of length 13.
assertAnalyzesTo(analyzer, "ka̠k̚t͡ɕ͈a̠k̚",
new String[]{"ka̠k̚t͡ɕ͈a̠k̚"},
new int[]{0},
new int[]{13},
new int[]{1}
);
assertPartsOfSpeech(analyzer, "ka̠k̚t͡ɕ͈a̠k̚",
new POS.Type[]{POS.Type.MORPHEME},
new POS.Tag[]{POS.Tag.SL},
new POS.Tag[]{POS.Tag.SL}
);
// Greek word: exercises a non-Latin, non-Cyrillic script staying whole.
assertAnalyzesTo(analyzer, "εἰμί",
new String[]{"εἰμί"},
new int[]{0},
new int[]{4},
new int[]{1}
);
assertPartsOfSpeech(analyzer, "εἰμί",
new POS.Type[]{POS.Type.MORPHEME},
new POS.Tag[]{POS.Tag.SL},
new POS.Tag[]{POS.Tag.SL}
);
}
private void assertReadings(Analyzer analyzer, String input, String... readings) throws IOException {
try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);