diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 469d6fb881a..6b001b941f5 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -214,6 +214,12 @@ Build
 
 * LUCENE-8537: ant test command fails under lucene/tools (Peter Somogyi)
 
+Bug fixes:
+
+* LUCENE-8548: The KoreanTokenizer no longer splits unknown words on combining diacritics and
+  detects script boundaries more accurately with Character#UnicodeScript#of.
+  (Christophe Bismuth, Jim Ferenczi)
+
 New Features
 
 * LUCENE-8026: ExitableDirectoryReader may now time out queries that run on
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
index ab3205f212e..012352c8687 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
@@ -43,6 +43,8 @@ import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.fst.FST;
 
+import static java.lang.Character.UnicodeScript;
+
 /**
  * Tokenizer for Korean that uses morphological analysis.
  *
@@ -718,27 +720,42 @@ public final class KoreanTokenizer extends Tokenizer {
 
       if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) {
         // Find unknown match:
-        final int characterId = characterDefinition.getCharacterClass(firstCharacter);
-        final boolean isPunct = isPunctuation(firstCharacter);
-
+        int characterId = characterDefinition.getCharacterClass(firstCharacter);
         // NOTE: copied from UnknownDictionary.lookup:
         int unknownWordLength;
         if (!characterDefinition.isGroup(firstCharacter)) {
           unknownWordLength = 1;
         } else {
-          // Extract unknown word. Characters with the same character class are considered to be part of unknown word
+          // Extract unknown word. Characters with the same script are considered to be part of unknown word
           unknownWordLength = 1;
+          UnicodeScript scriptCode = UnicodeScript.of((int) firstCharacter);
+          final boolean isPunct = isPunctuation(firstCharacter);
           for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) {
-            final int ch = buffer.get(posAhead);
-            if (ch == -1) {
+            int next = buffer.get(posAhead);
+            if (next == -1) {
               break;
             }
-            if (characterId == characterDefinition.getCharacterClass((char) ch) &&
-                isPunctuation((char) ch) == isPunct) {
+            char ch = (char) next;
+            int chType = Character.getType(ch);
+            UnicodeScript sc = UnicodeScript.of(next);
+            boolean sameScript = isSameScript(scriptCode, sc)
+                // Non-spacing marks inherit the script of their base character,
+                // following recommendations from UTR #24.
+                || chType == Character.NON_SPACING_MARK;
+
+            if (sameScript
+                && isPunctuation(ch, chType) == isPunct
+                && characterDefinition.isGroup(ch)) {
               unknownWordLength++;
             } else {
               break;
             }
+            // Update the script code and character class if the original script
+            // is Inherited or Common.
+            if (isCommonOrInherited(scriptCode) && isCommonOrInherited(sc) == false) {
+              scriptCode = sc;
+              characterId = characterDefinition.getCharacterClass(ch);
+            }
           }
         }
 
@@ -932,11 +949,15 @@ public final class KoreanTokenizer extends Tokenizer {
   }
 
   private static boolean isPunctuation(char ch) {
+    return isPunctuation(ch, Character.getType(ch));
+  }
+
+  private static boolean isPunctuation(char ch, int cid) {
     // special case for Hangul Letter Araea (interpunct)
     if (ch == 0x318D) {
       return true;
    }
-    switch(Character.getType(ch)) {
+    switch(cid) {
      case Character.SPACE_SEPARATOR:
      case Character.LINE_SEPARATOR:
      case Character.PARAGRAPH_SEPARATOR:
@@ -958,4 +979,16 @@ public final class KoreanTokenizer extends Tokenizer {
         return false;
     }
   }
+
+  private static boolean isCommonOrInherited(UnicodeScript script) {
+    return script == UnicodeScript.INHERITED ||
+           script == UnicodeScript.COMMON;
+  }
+
+  /** Determine if two scripts are compatible. */
+  private static boolean isSameScript(UnicodeScript scriptOne, UnicodeScript scriptTwo) {
+    return scriptOne == scriptTwo
+        || isCommonOrInherited(scriptOne)
+        || isCommonOrInherited(scriptTwo);
+  }
 }
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanAnalyzer.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanAnalyzer.java
index fd574cede49..a56047a7989 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanAnalyzer.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanAnalyzer.java
@@ -106,4 +106,4 @@ public class TestKoreanAnalyzer extends BaseTokenStreamTestCase {
         new int[]{1, 1, 1}
     );
   }
-}
\ No newline at end of file
+}
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
index 7c204fa2e3d..50104ff01bd 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
@@ -328,6 +328,44 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
     analyzer.close();
   }
 
+  public void testCombining() throws IOException {
+    assertAnalyzesTo(analyzer, "Ба̀лтичко мо̑ре",
+        new String[]{"Ба̀лтичко", "мо̑ре"},
+        new int[]{0, 10},
+        new int[]{9, 15},
+        new int[]{1, 1}
+    );
+    assertPartsOfSpeech(analyzer, "Ба̀лтичко мо̑ре",
+        new POS.Type[]{POS.Type.MORPHEME, POS.Type.MORPHEME},
+        new POS.Tag[]{POS.Tag.SL, POS.Tag.SL},
+        new POS.Tag[]{POS.Tag.SL, POS.Tag.SL}
+    );
+
+    assertAnalyzesTo(analyzer, "ka̠k̚t͡ɕ͈a̠k̚",
+        new String[]{"ka̠k̚t͡ɕ͈a̠k̚"},
+        new int[]{0},
+        new int[]{13},
+        new int[]{1}
+    );
+    assertPartsOfSpeech(analyzer, "ka̠k̚t͡ɕ͈a̠k̚",
+        new POS.Type[]{POS.Type.MORPHEME},
+        new POS.Tag[]{POS.Tag.SL},
+        new POS.Tag[]{POS.Tag.SL}
+    );
+
+    assertAnalyzesTo(analyzer, "εἰμί",
+        new String[]{"εἰμί"},
+        new int[]{0},
+        new int[]{4},
+        new int[]{1}
+    );
+    assertPartsOfSpeech(analyzer, "εἰμί",
+        new POS.Type[]{POS.Type.MORPHEME},
+        new POS.Tag[]{POS.Tag.SL},
+        new POS.Tag[]{POS.Tag.SL}
+    );
+  }
+
   private void assertReadings(Analyzer analyzer, String input, String... readings) throws IOException {
     try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
       ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
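Note (not part of the patch): the grouping rule introduced above can be exercised outside the tokenizer. The small standalone class below is illustrative only — ScriptRunDemo and scriptRunLength are made-up names, and the dictionary/character-class checks of the real code are omitted. It mirrors the isSameScript/isCommonOrInherited logic and shows why a combining mark such as U+0300, which Character.UnicodeScript.of reports as INHERITED and Character.getType reports as NON_SPACING_MARK, no longer ends an unknown-word run.

import static java.lang.Character.UnicodeScript;

// Illustrative sketch of the script-boundary rule used by the patched
// KoreanTokenizer; it deliberately skips the dictionary and character-class checks.
public class ScriptRunDemo {

  static boolean isCommonOrInherited(UnicodeScript script) {
    return script == UnicodeScript.INHERITED || script == UnicodeScript.COMMON;
  }

  static boolean isSameScript(UnicodeScript a, UnicodeScript b) {
    return a == b || isCommonOrInherited(a) || isCommonOrInherited(b);
  }

  // Length (in chars) of the run starting at offset 0 that stays within one script.
  static int scriptRunLength(String text) {
    if (text.isEmpty()) {
      return 0;
    }
    int first = text.codePointAt(0);
    UnicodeScript scriptCode = UnicodeScript.of(first);
    int length = Character.charCount(first);
    while (length < text.length()) {
      int cp = text.codePointAt(length);
      UnicodeScript sc = UnicodeScript.of(cp);
      // Combining marks (NON_SPACING_MARK) inherit the script of their base
      // character, so they never break the run (UTR #24 recommendation).
      boolean sameScript = isSameScript(scriptCode, sc)
          || Character.getType(cp) == Character.NON_SPACING_MARK;
      if (!sameScript) {
        break;
      }
      // Once a concrete script is seen, remember it instead of Common/Inherited.
      if (isCommonOrInherited(scriptCode) && !isCommonOrInherited(sc)) {
        scriptCode = sc;
      }
      length += Character.charCount(cp);
    }
    return length;
  }

  public static void main(String[] args) {
    System.out.println(UnicodeScript.of(0x0300));      // INHERITED (combining grave accent)
    System.out.println(scriptRunLength("Ба̀лтичко"));    // 9: the combining accent does not split the word
    System.out.println(scriptRunLength("abcεἰμί"));      // 3: the Latin/Greek boundary ends the run
  }
}

Run as-is, this should print INHERITED, 9 and 3, which matches the expectation in testCombining that "Ба̀лтичко" is kept as a single token while genuine script changes still mark a boundary.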