LUCENE-8548: The KoreanTokenizer no longer splits unknown words on combining diacritics and

detects script boundaries more accurately with Character.UnicodeScript#of.

Signed-off-by: Jim Ferenczi <jimczi@apache.org>
This commit is contained in:
Christophe Bismuth 2018-11-21 13:58:44 +01:00 committed by Jim Ferenczi
parent b6b9f9554e
commit 643ffc6f9f
4 changed files with 87 additions and 10 deletions

View File

@ -214,6 +214,12 @@ Build
* LUCENE-8537: ant test command fails under lucene/tools (Peter Somogyi)
Bug fixes:
* LUCENE-8548: The KoreanTokenizer no longer splits unknown words on combining diacritics and
detects script boundaries more accurately with Character.UnicodeScript#of.
(Christophe Bismuth, Jim Ferenczi)
New Features
* LUCENE-8026: ExitableDirectoryReader may now time out queries that run on

View File

@ -43,6 +43,8 @@ import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST;
import static java.lang.Character.UnicodeScript;
/**
* Tokenizer for Korean that uses morphological analysis.
* <p>
@ -718,27 +720,42 @@ public final class KoreanTokenizer extends Tokenizer {
if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) {
// Find unknown match:
final int characterId = characterDefinition.getCharacterClass(firstCharacter);
final boolean isPunct = isPunctuation(firstCharacter);
int characterId = characterDefinition.getCharacterClass(firstCharacter);
// NOTE: copied from UnknownDictionary.lookup:
int unknownWordLength;
if (!characterDefinition.isGroup(firstCharacter)) {
unknownWordLength = 1;
} else {
// Extract unknown word. Characters with the same character class are considered to be part of unknown word
// Extract unknown word. Characters with the same script are considered to be part of unknown word
unknownWordLength = 1;
UnicodeScript scriptCode = UnicodeScript.of((int) firstCharacter);
final boolean isPunct = isPunctuation(firstCharacter);
for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) {
final int ch = buffer.get(posAhead);
if (ch == -1) {
int next = buffer.get(posAhead);
if (next == -1) {
break;
}
if (characterId == characterDefinition.getCharacterClass((char) ch) &&
isPunctuation((char) ch) == isPunct) {
char ch = (char) next;
int chType = Character.getType(ch);
UnicodeScript sc = UnicodeScript.of(next);
boolean sameScript = isSameScript(scriptCode, sc)
// Non-spacing marks inherit the script of their base character,
// following recommendations from UTR #24.
|| chType == Character.NON_SPACING_MARK;
if (sameScript
&& isPunctuation(ch, chType) == isPunct
&& characterDefinition.isGroup(ch)) {
unknownWordLength++;
} else {
break;
}
// Update the script code and character class if the original script
// is Inherited or Common.
if (isCommonOrInherited(scriptCode) && isCommonOrInherited(sc) == false) {
scriptCode = sc;
characterId = characterDefinition.getCharacterClass(ch);
}
}
}
@ -932,11 +949,15 @@ public final class KoreanTokenizer extends Tokenizer {
}
/** Convenience overload: classifies {@code ch} as punctuation using its Unicode general category. */
private static boolean isPunctuation(char ch) {
// Delegates to the two-argument form so callers that already hold the
// Character.getType result (e.g. the unknown-word loop) avoid recomputing it.
return isPunctuation(ch, Character.getType(ch));
}
private static boolean isPunctuation(char ch, int cid) {
// special case for Hangul Letter Araea (interpunct)
if (ch == 0x318D) {
return true;
}
switch(Character.getType(ch)) {
switch(cid) {
case Character.SPACE_SEPARATOR:
case Character.LINE_SEPARATOR:
case Character.PARAGRAPH_SEPARATOR:
@ -958,4 +979,16 @@ public final class KoreanTokenizer extends Tokenizer {
return false;
}
}
/**
 * Returns whether {@code script} is one of the two special Unicode script values
 * (COMMON, INHERITED) that do not identify a concrete writing system and may
 * therefore appear inside a run of any real script.
 */
private static boolean isCommonOrInherited(UnicodeScript script) {
switch (script) {
case COMMON:
case INHERITED:
return true;
default:
return false;
}
}
/**
 * Determine if two scripts are compatible, i.e. may belong to the same
 * unknown-word run. Identical scripts always match; COMMON and INHERITED
 * match anything, per UTR #24 recommendations.
 */
private static boolean isSameScript(UnicodeScript scriptOne, UnicodeScript scriptTwo) {
if (scriptOne == scriptTwo) {
return true;
}
// A script-less (Common/Inherited) character on either side never breaks a run.
return isCommonOrInherited(scriptOne) || isCommonOrInherited(scriptTwo);
}
}

View File

@ -106,4 +106,4 @@ public class TestKoreanAnalyzer extends BaseTokenStreamTestCase {
new int[]{1, 1, 1}
);
}
}
}

View File

@ -328,6 +328,44 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
analyzer.close();
}
/**
 * Verifies LUCENE-8548: unknown-word tokenization must not split on combining
 * diacritical marks, and script boundaries (not just character class) delimit tokens.
 * NOTE(review): the fixture strings below contain combining code points, so the
 * expected offsets count UTF-16 units, not visible glyphs — keep them byte-exact.
 */
public void testCombining() throws IOException {
// Cyrillic words carrying combining accents (e.g. combining grave/inverted breve):
// each word must stay a single token despite the NON_SPACING_MARK characters inside.
assertAnalyzesTo(analyzer, "Ба̀лтичко мо̑ре",
new String[]{"Ба̀лтичко", "мо̑ре"},
new int[]{0, 10},
new int[]{9, 15},
new int[]{1, 1}
);
// Both tokens are tagged as foreign-language (SL) morphemes.
assertPartsOfSpeech(analyzer, "Ба̀лтичко мо̑ре",
new POS.Type[]{POS.Type.MORPHEME, POS.Type.MORPHEME},
new POS.Tag[]{POS.Tag.SL, POS.Tag.SL},
new POS.Tag[]{POS.Tag.SL, POS.Tag.SL}
);
// IPA transcription mixing Latin letters with several combining/modifier marks
// (tie bar, minus sign below, etc.): must remain one token of length 13.
assertAnalyzesTo(analyzer, "ka̠k̚t͡ɕ͈a̠k̚",
new String[]{"ka̠k̚t͡ɕ͈a̠k̚"},
new int[]{0},
new int[]{13},
new int[]{1}
);
assertPartsOfSpeech(analyzer, "ka̠k̚t͡ɕ͈a̠k̚",
new POS.Type[]{POS.Type.MORPHEME},
new POS.Tag[]{POS.Tag.SL},
new POS.Tag[]{POS.Tag.SL}
);
// Greek word: exercises a non-Latin, non-Cyrillic script staying whole.
assertAnalyzesTo(analyzer, "εἰμί",
new String[]{"εἰμί"},
new int[]{0},
new int[]{4},
new int[]{1}
);
assertPartsOfSpeech(analyzer, "εἰμί",
new POS.Type[]{POS.Type.MORPHEME},
new POS.Tag[]{POS.Tag.SL},
new POS.Tag[]{POS.Tag.SL}
);
}
private void assertReadings(Analyzer analyzer, String input, String... readings) throws IOException {
try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);