From 55858d7ba72f857ded79035430855e511a8e319d Mon Sep 17 00:00:00 2001
From: Jim Ferenczi
Date: Wed, 23 May 2018 16:12:43 +0200
Subject: [PATCH] LUCENE-8325: Fixed the smartcn tokenizer to not split UTF-16 surrogate pairs.

---
 lucene/CHANGES.txt | 3 ++
 .../lucene/analysis/cn/smart/CharType.java | 5 ++
 .../lucene/analysis/cn/smart/Utility.java | 4 ++
 .../analysis/cn/smart/hhmm/HHMMSegmenter.java | 18 +++++--
 .../cn/smart/TestSmartChineseAnalyzer.java | 53 +++++++++++++++----
 5 files changed, 68 insertions(+), 15 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index ba282d10ff8..54a8fbadea8 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -232,6 +232,9 @@ Bug Fixes
 * LUCENE-8328: Ensure ReadersAndUpdates consistently executes under lock.
   (Nhat Nguyen via Simon Willnauer)
 
+* LUCENE-8325: Fixed the smartcn tokenizer to not split UTF-16 surrogate pairs.
+  (chengpohi via Jim Ferenczi)
+
 Other
 
 * LUCENE-8301: Update randomizedtesting to 2.6.0. (Dawid Weiss)
diff --git a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/CharType.java b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/CharType.java
index 4ad58779622..d576809bbc7 100644
--- a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/CharType.java
+++ b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/CharType.java
@@ -62,4 +62,9 @@ public class CharType {
    */
   public final static int OTHER = 7;
 
+  /**
+   * Surrogate character
+   */
+  public final static int SURROGATE = 8;
+
 }
diff --git a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java
index 81ca52e2c21..1d6eeb9ec37 100644
--- a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java
+++ b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java
@@ -18,6 +18,8 @@ package org.apache.lucene.analysis.cn.smart;
 
 import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter; // for javadoc
 
+import static java.lang.Character.isSurrogate;
+
 /**
  * SmartChineseAnalyzer utility constants and methods
  * @lucene.experimental
  */
@@ -152,6 +154,8 @@ public class Utility {
    * @see CharType
    */
   public static int getCharType(char ch) {
+    if (isSurrogate(ch))
+      return CharType.SURROGATE;
     // Most (but not all!) of these are Han Ideographic Characters
     if (ch >= 0x4E00 && ch <= 0x9FA5)
       return CharType.HANZI;
diff --git a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java
index bd691900e12..4d4cd4492e8 100644
--- a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java
+++ b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java
@@ -21,7 +21,6 @@ import java.util.List;
 import org.apache.lucene.analysis.cn.smart.CharType;
 import org.apache.lucene.analysis.cn.smart.Utility;
 import org.apache.lucene.analysis.cn.smart.WordType;
-import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;//javadoc @link
 
 /**
  * Finds the optimal segmentation of a sentence into Chinese words
@@ -33,7 +32,7 @@ public class HHMMSegmenter {
 
   /**
    * Create the {@link SegGraph} for a sentence.
-   * 
+   *
    * @param sentence input sentence, without start and end markers
   * @return {@link SegGraph} corresponding to the input sentence.
   */
@@ -57,11 +56,20 @@ public class HHMMSegmenter {
         case CharType.SPACE_LIKE:
           i++;
           break;
+        case CharType.SURROGATE:
+          int state = Character.codePointAt(sentence, i);
+          int count = Character.charCount(state);
+          charArray = new char[count];
+          sentence.getChars(i, i + count, charArray, 0);
+          token = new SegToken(charArray, i, i + count, WordType.CHINESE_WORD, 0);
+          segGraph.addToken(token);
+          i += count;
+          break;
         case CharType.HANZI:
           j = i + 1;
           wordBuf.delete(0, wordBuf.length());
-          // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not, 
-          // it will store that single Chinese character (Hanzi) in the SegGraph. Otherwise, it will 
+          // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not,
+          // it will store that single Chinese character (Hanzi) in the SegGraph. Otherwise, it will
           // cause word division.
           wordBuf.append(sentence.charAt(i));
           charArray = new char[] { sentence.charAt(i) };
@@ -175,7 +183,7 @@ public class HHMMSegmenter {
 
   /**
    * Get the character types for every character in a sentence.
-   * 
+   *
    * @see Utility#getCharType(char)
    * @param sentence input sentence
   * @return array of character types corresponding to character positions in the sentence
diff --git a/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java b/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
index 6460fbd8423..93db8a3b489 100644
--- a/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
+++ b/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
@@ -16,13 +16,16 @@
  */
 package org.apache.lucene.analysis.cn.smart;
 
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.util.IOUtils;
 
 public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
-  
+
   public void testChineseStopWordsDefault() throws Exception {
     Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */
     String sentence = "我购买了道具和服装。";
     String result[] = { "我", "购买", "了", "道具", "和", "服装" };
     assertAnalyzesTo(ca, sentence, result);
     ca.close();
   }
@@ -46,7 +49,37 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
     assertAnalyzesTo(ca, sentence, result);
     ca.close();
   }
-  
+
+  /*
+   * This test is for test smartcn HHMMSegmenter should correctly handle surrogate character.
+   */
+  public void testSurrogatePairCharacter() throws Exception {
+    Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */
+    String sentence =
+        Stream.of(
+                "\uD872\uDF3B",
+                "\uD872\uDF4A",
+                "\uD872\uDF73",
+                "\uD872\uDF5B",
+                "\u9FCF",
+                "\uD86D\uDFFC",
+                "\uD872\uDF2D",
+                "\u9FD4")
+            .collect(Collectors.joining());
+    String result[] = {
+      "\uD872\uDF3B",
+      "\uD872\uDF4A",
+      "\uD872\uDF73",
+      "\uD872\uDF5B",
+      "\u9FCF",
+      "\uD86D\uDFFC",
+      "\uD872\uDF2D",
+      "\u9FD4"
+    };
+    assertAnalyzesTo(ca, sentence, result);
+    ca.close();
+  }
+
   /*
    * This test is the same as the above, except using an ideographic space as a separator.
   * This tests to ensure the stopwords are working correctly.
@@ -166,7 +199,7 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
         new String[] { "优", "素", "福", "拉", "扎", "吉", "拉", "尼" });
     analyzer.close();
   }
-  
+
   public void testOffsets() throws Exception {
     Analyzer analyzer = new SmartChineseAnalyzer(true);
     assertAnalyzesTo(analyzer, "我购买了道具和服装",
         new String[] { "我", "购买", "了", "道具", "和", "服装" },
         new int[] { 0, 1, 3, 4, 6, 7 },
@@ -175,10 +208,10 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
         new int[] { 1, 3, 4, 6, 7, 9 });
     analyzer.close();
   }
-  
+
   public void testReusableTokenStream() throws Exception {
     Analyzer a = new SmartChineseAnalyzer();
-    assertAnalyzesTo(a, "我购买 Tests 了道具和服装", 
+    assertAnalyzesTo(a, "我购买 Tests 了道具和服装",
         new String[] { "我", "购买", "test", "了", "道具", "和", "服装"},
         new int[] { 0, 1, 4, 10, 11, 13, 14 },
         new int[] { 1, 3, 9, 11, 13, 14, 16 });
     assertAnalyzesTo(a, "我购买了道具和服装。",
         new String[] { "我", "购买", "了", "道具", "和", "服装" },
         new int[] { 0, 1, 3, 4, 6, 7 },
@@ -188,7 +221,7 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
         new int[] { 1, 3, 4, 6, 7, 9 });
     a.close();
   }
-  
+
   // LUCENE-3026
   public void testLargeDocument() throws Exception {
     StringBuilder sb = new StringBuilder();
     for (int i = 0; i < 5000; i++) {
       sb.append("我购买了道具和服装。");
     }
     Analyzer analyzer = new SmartChineseAnalyzer();
     try (TokenStream stream = analyzer.tokenStream("", sb.toString())) {
       stream.reset();
       while (stream.incrementToken()) {
       }
@@ -203,7 +236,7 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
       stream.end();
     }
   }
-  
+
   // LUCENE-3026
   public void testLargeSentence() throws Exception {
     StringBuilder sb = new StringBuilder();
     for (int i = 0; i < 5000; i++) {
       sb.append("我购买了道具和服装");
     }
     Analyzer analyzer = new SmartChineseAnalyzer();
     try (TokenStream stream = analyzer.tokenStream("", sb.toString())) {
       stream.reset();
       while (stream.incrementToken()) {
       }
@@ -218,14 +251,14 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
       stream.end();
     }
   }
-  
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     Analyzer analyzer = new SmartChineseAnalyzer();
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
     analyzer.close();
   }
-  
+
   /** blast some random large strings through the analyzer */
   public void testRandomHugeStrings() throws Exception {
     Analyzer analyzer = new SmartChineseAnalyzer();
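Note (illustrative, not part of the commit): the new CharType.SURROGATE branch above avoids splitting a UTF-16 surrogate pair by reading the full code point and advancing by Character.charCount instead of one char at a time. The standalone sketch below shows the same Character.isSurrogate / codePointAt / charCount pattern on a plain String; the class name and sample text are made up for the example.

import java.util.ArrayList;
import java.util.List;

public class SurrogatePairSketch {
  public static void main(String[] args) {
    // One supplementary CJK character (a surrogate pair) followed by a BMP Hanzi.
    String sentence = "\uD872\uDF3B\u9FCF";
    List<String> tokens = new ArrayList<>();

    int i = 0;
    while (i < sentence.length()) {
      char ch = sentence.charAt(i);
      if (Character.isSurrogate(ch)) {
        // Same calls the patch uses in HHMMSegmenter: read the whole code point
        // and step over charCount(cp) chars (2 for a pair) so it is never split.
        int cp = Character.codePointAt(sentence, i);
        int count = Character.charCount(cp);
        tokens.add(sentence.substring(i, i + count));
        i += count;
      } else {
        tokens.add(String.valueOf(ch));
        i++;
      }
    }
    // Prints two tokens: the surrogate pair kept whole, then the BMP character.
    System.out.println(tokens.size() + " tokens: " + tokens);
  }
}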