LUCENE-8325: Fixed the smartcn tokenizer to not split UTF-16 surrogate pairs.

This commit is contained in:
Jim Ferenczi 2018-05-23 16:12:43 +02:00
parent 14a7cd1159
commit 55858d7ba7
5 changed files with 68 additions and 15 deletions

View File

@ -232,6 +232,9 @@ Bug Fixes
* LUCENE-8328: Ensure ReadersAndUpdates consistently executes under lock. * LUCENE-8328: Ensure ReadersAndUpdates consistently executes under lock.
(Nhat Nguyen via Simon Willnauer) (Nhat Nguyen via Simon Willnauer)
* LUCENE-8325: Fixed the smartcn tokenizer to not split UTF-16 surrogate pairs.
(chengpohi via Jim Ferenczi)
Other Other
* LUCENE-8301: Update randomizedtesting to 2.6.0. (Dawid Weiss) * LUCENE-8301: Update randomizedtesting to 2.6.0. (Dawid Weiss)

View File

@ -62,4 +62,9 @@ public class CharType {
*/ */
public final static int OTHER = 7; public final static int OTHER = 7;
/**
* Surrogate character
*/
public final static int SURROGATE = 8;
} }

View File

@ -18,6 +18,8 @@ package org.apache.lucene.analysis.cn.smart;
import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter; // for javadoc import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter; // for javadoc
import static java.lang.Character.isSurrogate;
/** /**
* SmartChineseAnalyzer utility constants and methods * SmartChineseAnalyzer utility constants and methods
* @lucene.experimental * @lucene.experimental
@ -152,6 +154,8 @@ public class Utility {
* @see CharType * @see CharType
*/ */
public static int getCharType(char ch) { public static int getCharType(char ch) {
if (isSurrogate(ch))
return CharType.SURROGATE;
// Most (but not all!) of these are Han Ideographic Characters // Most (but not all!) of these are Han Ideographic Characters
if (ch >= 0x4E00 && ch <= 0x9FA5) if (ch >= 0x4E00 && ch <= 0x9FA5)
return CharType.HANZI; return CharType.HANZI;

View File

@ -21,7 +21,6 @@ import java.util.List;
import org.apache.lucene.analysis.cn.smart.CharType; import org.apache.lucene.analysis.cn.smart.CharType;
import org.apache.lucene.analysis.cn.smart.Utility; import org.apache.lucene.analysis.cn.smart.Utility;
import org.apache.lucene.analysis.cn.smart.WordType; import org.apache.lucene.analysis.cn.smart.WordType;
import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;//javadoc @link
/** /**
* Finds the optimal segmentation of a sentence into Chinese words * Finds the optimal segmentation of a sentence into Chinese words
@ -33,7 +32,7 @@ public class HHMMSegmenter {
/** /**
* Create the {@link SegGraph} for a sentence. * Create the {@link SegGraph} for a sentence.
* *
* @param sentence input sentence, without start and end markers * @param sentence input sentence, without start and end markers
* @return {@link SegGraph} corresponding to the input sentence. * @return {@link SegGraph} corresponding to the input sentence.
*/ */
@ -57,11 +56,20 @@ public class HHMMSegmenter {
case CharType.SPACE_LIKE: case CharType.SPACE_LIKE:
i++; i++;
break; break;
case CharType.SURROGATE:
int state = Character.codePointAt(sentence, i);
int count = Character.charCount(state);
charArray = new char[count];
sentence.getChars(i, i + count, charArray, 0);
token = new SegToken(charArray, i, i + count, WordType.CHINESE_WORD, 0);
segGraph.addToken(token);
i += count;
break;
case CharType.HANZI: case CharType.HANZI:
j = i + 1; j = i + 1;
wordBuf.delete(0, wordBuf.length()); wordBuf.delete(0, wordBuf.length());
// It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not, // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not,
// it will store that single Chinese character (Hanzi) in the SegGraph. Otherwise, it will // it will store that single Chinese character (Hanzi) in the SegGraph. Otherwise, it will
// cause word division. // cause word division.
wordBuf.append(sentence.charAt(i)); wordBuf.append(sentence.charAt(i));
charArray = new char[] { sentence.charAt(i) }; charArray = new char[] { sentence.charAt(i) };
@ -175,7 +183,7 @@ public class HHMMSegmenter {
/** /**
* Get the character types for every character in a sentence. * Get the character types for every character in a sentence.
* *
* @see Utility#getCharType(char) * @see Utility#getCharType(char)
* @param sentence input sentence * @param sentence input sentence
* @return array of character types corresponding to character positions in the sentence * @return array of character types corresponding to character positions in the sentence

View File

@ -16,13 +16,16 @@
*/ */
package org.apache.lucene.analysis.cn.smart; package org.apache.lucene.analysis.cn.smart;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase { public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
public void testChineseStopWordsDefault() throws Exception { public void testChineseStopWordsDefault() throws Exception {
Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */ Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */
String sentence = "我购买了道具和服装。"; String sentence = "我购买了道具和服装。";
@ -46,7 +49,37 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
assertAnalyzesTo(ca, sentence, result); assertAnalyzesTo(ca, sentence, result);
ca.close(); ca.close();
} }
/*
 * LUCENE-8325: the smartcn HHMMSegmenter must not split UTF-16 surrogate pairs;
 * each supplementary character should come back from the analyzer as one token.
 */
public void testSurrogatePairCharacter() throws Exception {
  Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */
  // Expected tokens: six supplementary characters (surrogate pairs) mixed with
  // two BMP Han characters; each must survive analysis as a single token.
  String[] result = {
      "\uD872\uDF3B",
      "\uD872\uDF4A",
      "\uD872\uDF73",
      "\uD872\uDF5B",
      "\u9FCF",
      "\uD86D\uDFFC",
      "\uD872\uDF2D",
      "\u9FD4"
  };
  // Build the input sentence from the expected tokens so the two can never
  // drift apart (the original duplicated the literal list in both places).
  String sentence = Stream.of(result).collect(Collectors.joining());
  assertAnalyzesTo(ca, sentence, result);
  ca.close();
}
/* /*
* This test is the same as the above, except using an ideographic space as a separator. * This test is the same as the above, except using an ideographic space as a separator.
* This tests to ensure the stopwords are working correctly. * This tests to ensure the stopwords are working correctly.
@ -166,7 +199,7 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
new String[] { "", "", "", "", "", "", "", "" }); new String[] { "", "", "", "", "", "", "", "" });
analyzer.close(); analyzer.close();
} }
public void testOffsets() throws Exception { public void testOffsets() throws Exception {
Analyzer analyzer = new SmartChineseAnalyzer(true); Analyzer analyzer = new SmartChineseAnalyzer(true);
assertAnalyzesTo(analyzer, "我购买了道具和服装", assertAnalyzesTo(analyzer, "我购买了道具和服装",
@ -175,10 +208,10 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
new int[] { 1, 3, 4, 6, 7, 9 }); new int[] { 1, 3, 4, 6, 7, 9 });
analyzer.close(); analyzer.close();
} }
public void testReusableTokenStream() throws Exception { public void testReusableTokenStream() throws Exception {
Analyzer a = new SmartChineseAnalyzer(); Analyzer a = new SmartChineseAnalyzer();
assertAnalyzesTo(a, "我购买 Tests 了道具和服装", assertAnalyzesTo(a, "我购买 Tests 了道具和服装",
new String[] { "", "购买", "test", "", "道具", "", "服装"}, new String[] { "", "购买", "test", "", "道具", "", "服装"},
new int[] { 0, 1, 4, 10, 11, 13, 14 }, new int[] { 0, 1, 4, 10, 11, 13, 14 },
new int[] { 1, 3, 9, 11, 13, 14, 16 }); new int[] { 1, 3, 9, 11, 13, 14, 16 });
@ -188,7 +221,7 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
new int[] { 1, 3, 4, 6, 7, 9 }); new int[] { 1, 3, 4, 6, 7, 9 });
a.close(); a.close();
} }
// LUCENE-3026 // LUCENE-3026
public void testLargeDocument() throws Exception { public void testLargeDocument() throws Exception {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
@ -203,7 +236,7 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
stream.end(); stream.end();
} }
} }
// LUCENE-3026 // LUCENE-3026
public void testLargeSentence() throws Exception { public void testLargeSentence() throws Exception {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
@ -218,14 +251,14 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
stream.end(); stream.end();
} }
} }
/** blast some random strings through the analyzer */ /** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception { public void testRandomStrings() throws Exception {
Analyzer analyzer = new SmartChineseAnalyzer(); Analyzer analyzer = new SmartChineseAnalyzer();
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER); checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
analyzer.close(); analyzer.close();
} }
/** blast some random large strings through the analyzer */ /** blast some random large strings through the analyzer */
public void testRandomHugeStrings() throws Exception { public void testRandomHugeStrings() throws Exception {
Analyzer analyzer = new SmartChineseAnalyzer(); Analyzer analyzer = new SmartChineseAnalyzer();