LUCENE-8325: Fixed the smartcn tokenizer to not split UTF-16 surrogate pairs.

2018-05-23 16:12:43 +02:00 · 2018-05-23 16:12:43 +02:00 · 55858d7ba7
parent 14a7cd1159
commit 55858d7ba7
5 changed files with 68 additions and 15 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -232,6 +232,9 @@ Bug Fixes
 * LUCENE-8328: Ensure ReadersAndUpdates consistently executes under lock.
  (Nhat Nguyen via Simon Willnauer)

+* LUCENE-8325: Fixed the smartcn tokenizer to not split UTF-16 surrogate pairs.
+  (chengpohi via Jim Ferenczi)
+
 Other

 * LUCENE-8301: Update randomizedtesting to 2.6.0. (Dawid Weiss)
--- a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/CharType.java
+++ b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/CharType.java
@ -62,4 +62,9 @@ public class CharType {
   */
  public final static int OTHER = 7;

+  /**
+   * Surrogate character (a UTF-16 high or low surrogate code unit)
+   */
+  public final static int SURROGATE = 8;
+
 }
--- a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java
+++ b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java
@ -18,6 +18,8 @@ package org.apache.lucene.analysis.cn.smart;

 import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter; // for javadoc

+import static java.lang.Character.isSurrogate;
+
 /**
 * SmartChineseAnalyzer utility constants and methods
 * @lucene.experimental
@ -152,6 +154,8 @@ public class Utility {
   * @see CharType
   */
  public static int getCharType(char ch) {
+    if (isSurrogate(ch))
+      return CharType.SURROGATE;
    // Most (but not all!) of these are Han Ideographic Characters
    if (ch >= 0x4E00 && ch <= 0x9FA5)
      return CharType.HANZI;
--- a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java
+++ b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java
@ -21,7 +21,6 @@ import java.util.List;
 import org.apache.lucene.analysis.cn.smart.CharType;
 import org.apache.lucene.analysis.cn.smart.Utility;
 import org.apache.lucene.analysis.cn.smart.WordType;
-import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;//javadoc @link

 /**
 * Finds the optimal segmentation of a sentence into Chinese words
@ -57,6 +56,15 @@ public class HHMMSegmenter {
        case CharType.SPACE_LIKE:
          i++;
          break;
+        case CharType.SURROGATE:
+          int state = Character.codePointAt(sentence, i);
+          int count = Character.charCount(state);
+          charArray = new char[count];
+          sentence.getChars(i, i + count, charArray, 0);
+          token = new SegToken(charArray, i, i + count, WordType.CHINESE_WORD, 0);
+          segGraph.addToken(token);
+          i += count;
+          break;
        case CharType.HANZI:
          j = i + 1;
          wordBuf.delete(0, wordBuf.length());
--- a/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
+++ b/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
@ -16,8 +16,11 @@
 */
 package org.apache.lucene.analysis.cn.smart;

-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.util.IOUtils;

@ -47,6 +50,36 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
    ca.close();
  }

+  /*
+   * This test checks that the smartcn HHMMSegmenter correctly handles surrogate pair characters.
+   */
+  public void testSurrogatePairCharacter() throws Exception {
+    Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */
+    String sentence =
+        Stream.of(
+                "\uD872\uDF3B",
+                "\uD872\uDF4A",
+                "\uD872\uDF73",
+                "\uD872\uDF5B",
+                "\u9FCF",
+                "\uD86D\uDFFC",
+                "\uD872\uDF2D",
+                "\u9FD4")
+            .collect(Collectors.joining());
+    String result[] = {
+      "\uD872\uDF3B",
+      "\uD872\uDF4A",
+      "\uD872\uDF73",
+      "\uD872\uDF5B",
+      "\u9FCF",
+      "\uD86D\uDFFC",
+      "\uD872\uDF2D",
+      "\u9FD4"
+    };
+    assertAnalyzesTo(ca, sentence, result);
+    ca.close();
+  }
+
  /*
   * This test is the same as the above, except using an ideographic space as a separator.
   * This tests to ensure the stopwords are working correctly.