mirror of https://github.com/apache/lucene.git
LUCENE-8325: Fixed the smartcn tokenizer to not split UTF-16 surrogate pairs.
This commit is contained in:
parent
14a7cd1159
commit
55858d7ba7
|
@ -232,6 +232,9 @@ Bug Fixes
|
|||
* LUCENE-8328: Ensure ReadersAndUpdates consistently executes under lock.
|
||||
(Nhat Nguyen via Simon Willnauer)
|
||||
|
||||
* LUCENE-8325: Fixed the smartcn tokenizer to not split UTF-16 surrogate pairs.
|
||||
(chengpohi via Jim Ferenczi)
|
||||
|
||||
Other
|
||||
|
||||
* LUCENE-8301: Update randomizedtesting to 2.6.0. (Dawid Weiss)
|
||||
|
|
|
@ -62,4 +62,9 @@ public class CharType {
|
|||
*/
|
||||
public final static int OTHER = 7;
|
||||
|
||||
/**
|
||||
* UTF-16 surrogate code unit (either half of a surrogate pair)
|
||||
*/
|
||||
public final static int SURROGATE = 8;
|
||||
|
||||
}
|
||||
|
|
|
@ -18,6 +18,8 @@ package org.apache.lucene.analysis.cn.smart;
|
|||
|
||||
import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter; // for javadoc
|
||||
|
||||
import static java.lang.Character.isSurrogate;
|
||||
|
||||
/**
|
||||
* SmartChineseAnalyzer utility constants and methods
|
||||
* @lucene.experimental
|
||||
|
@ -152,6 +154,8 @@ public class Utility {
|
|||
* @see CharType
|
||||
*/
|
||||
public static int getCharType(char ch) {
|
||||
if (isSurrogate(ch))
|
||||
return CharType.SURROGATE;
|
||||
// Most (but not all!) of these are Han Ideographic Characters
|
||||
if (ch >= 0x4E00 && ch <= 0x9FA5)
|
||||
return CharType.HANZI;
|
||||
|
|
|
@ -21,7 +21,6 @@ import java.util.List;
|
|||
import org.apache.lucene.analysis.cn.smart.CharType;
|
||||
import org.apache.lucene.analysis.cn.smart.Utility;
|
||||
import org.apache.lucene.analysis.cn.smart.WordType;
|
||||
import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;//javadoc @link
|
||||
|
||||
/**
|
||||
* Finds the optimal segmentation of a sentence into Chinese words
|
||||
|
@ -33,7 +32,7 @@ public class HHMMSegmenter {
|
|||
|
||||
/**
|
||||
* Create the {@link SegGraph} for a sentence.
|
||||
*
|
||||
*
|
||||
* @param sentence input sentence, without start and end markers
|
||||
* @return {@link SegGraph} corresponding to the input sentence.
|
||||
*/
|
||||
|
@ -57,11 +56,20 @@ public class HHMMSegmenter {
|
|||
case CharType.SPACE_LIKE:
|
||||
i++;
|
||||
break;
|
||||
case CharType.SURROGATE:
|
||||
int state = Character.codePointAt(sentence, i);
|
||||
int count = Character.charCount(state);
|
||||
charArray = new char[count];
|
||||
sentence.getChars(i, i + count, charArray, 0);
|
||||
token = new SegToken(charArray, i, i + count, WordType.CHINESE_WORD, 0);
|
||||
segGraph.addToken(token);
|
||||
i += count;
|
||||
break;
|
||||
case CharType.HANZI:
|
||||
j = i + 1;
|
||||
wordBuf.delete(0, wordBuf.length());
|
||||
// It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not,
|
||||
// it will store that single Chinese character (Hanzi) in the SegGraph. Otherwise, it will
|
||||
// It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not,
|
||||
// it will store that single Chinese character (Hanzi) in the SegGraph. Otherwise, it will
|
||||
// cause word division.
|
||||
wordBuf.append(sentence.charAt(i));
|
||||
charArray = new char[] { sentence.charAt(i) };
|
||||
|
@ -175,7 +183,7 @@ public class HHMMSegmenter {
|
|||
|
||||
/**
|
||||
* Get the character types for every character in a sentence.
|
||||
*
|
||||
*
|
||||
* @see Utility#getCharType(char)
|
||||
* @param sentence input sentence
|
||||
* @return array of character types corresponding to character positions in the sentence
|
||||
|
|
|
@ -16,13 +16,16 @@
|
|||
*/
|
||||
package org.apache.lucene.analysis.cn.smart;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
|
||||
|
||||
|
||||
public void testChineseStopWordsDefault() throws Exception {
|
||||
Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */
|
||||
String sentence = "我购买了道具和服装。";
|
||||
|
@ -46,7 +49,37 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(ca, sentence, result);
|
||||
ca.close();
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* This test is for test smartcn HHMMSegmenter should correctly handle surrogate character.
|
||||
*/
|
||||
public void testSurrogatePairCharacter() throws Exception {
|
||||
Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */
|
||||
String sentence =
|
||||
Stream.of(
|
||||
"\uD872\uDF3B",
|
||||
"\uD872\uDF4A",
|
||||
"\uD872\uDF73",
|
||||
"\uD872\uDF5B",
|
||||
"\u9FCF",
|
||||
"\uD86D\uDFFC",
|
||||
"\uD872\uDF2D",
|
||||
"\u9FD4")
|
||||
.collect(Collectors.joining());
|
||||
String result[] = {
|
||||
"\uD872\uDF3B",
|
||||
"\uD872\uDF4A",
|
||||
"\uD872\uDF73",
|
||||
"\uD872\uDF5B",
|
||||
"\u9FCF",
|
||||
"\uD86D\uDFFC",
|
||||
"\uD872\uDF2D",
|
||||
"\u9FD4"
|
||||
};
|
||||
assertAnalyzesTo(ca, sentence, result);
|
||||
ca.close();
|
||||
}
|
||||
|
||||
/*
|
||||
* This test is the same as the above, except using an ideographic space as a separator.
|
||||
* This tests to ensure the stopwords are working correctly.
|
||||
|
@ -166,7 +199,7 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
|
|||
new String[] { "优", "素", "福", "拉", "扎", "吉", "拉", "尼" });
|
||||
analyzer.close();
|
||||
}
|
||||
|
||||
|
||||
public void testOffsets() throws Exception {
|
||||
Analyzer analyzer = new SmartChineseAnalyzer(true);
|
||||
assertAnalyzesTo(analyzer, "我购买了道具和服装",
|
||||
|
@ -175,10 +208,10 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
|
|||
new int[] { 1, 3, 4, 6, 7, 9 });
|
||||
analyzer.close();
|
||||
}
|
||||
|
||||
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
Analyzer a = new SmartChineseAnalyzer();
|
||||
assertAnalyzesTo(a, "我购买 Tests 了道具和服装",
|
||||
assertAnalyzesTo(a, "我购买 Tests 了道具和服装",
|
||||
new String[] { "我", "购买", "test", "了", "道具", "和", "服装"},
|
||||
new int[] { 0, 1, 4, 10, 11, 13, 14 },
|
||||
new int[] { 1, 3, 9, 11, 13, 14, 16 });
|
||||
|
@ -188,7 +221,7 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
|
|||
new int[] { 1, 3, 4, 6, 7, 9 });
|
||||
a.close();
|
||||
}
|
||||
|
||||
|
||||
// LUCENE-3026
|
||||
public void testLargeDocument() throws Exception {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
@ -203,7 +236,7 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
|
|||
stream.end();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// LUCENE-3026
|
||||
public void testLargeSentence() throws Exception {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
@ -218,14 +251,14 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
|
|||
stream.end();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
Analyzer analyzer = new SmartChineseAnalyzer();
|
||||
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
|
||||
analyzer.close();
|
||||
}
|
||||
|
||||
|
||||
/** blast some random large strings through the analyzer */
|
||||
public void testRandomHugeStrings() throws Exception {
|
||||
Analyzer analyzer = new SmartChineseAnalyzer();
|
||||
|
|
Loading…
Reference in New Issue