mirror of https://github.com/apache/lucene.git
LUCENE-8325: Fixed the smartcn tokenizer to not split UTF-16 surrogate pairs.
This commit is contained in:
parent
14a7cd1159
commit
55858d7ba7
|
@ -232,6 +232,9 @@ Bug Fixes
|
||||||
* LUCENE-8328: Ensure ReadersAndUpdates consistently executes under lock.
|
* LUCENE-8328: Ensure ReadersAndUpdates consistently executes under lock.
|
||||||
(Nhat Nguyen via Simon Willnauer)
|
(Nhat Nguyen via Simon Willnauer)
|
||||||
|
|
||||||
|
* LUCENE-8325: Fixed the smartcn tokenizer to not split UTF-16 surrogate pairs.
|
||||||
|
(chengpohi via Jim Ferenczi)
|
||||||
|
|
||||||
Other
|
Other
|
||||||
|
|
||||||
* LUCENE-8301: Update randomizedtesting to 2.6.0. (Dawid Weiss)
|
* LUCENE-8301: Update randomizedtesting to 2.6.0. (Dawid Weiss)
|
||||||
|
|
|
@ -62,4 +62,9 @@ public class CharType {
|
||||||
*/
|
*/
|
||||||
public final static int OTHER = 7;
|
public final static int OTHER = 7;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Surrogate character (a single UTF-16 high or low surrogate code unit)
|
||||||
|
*/
|
||||||
|
public final static int SURROGATE = 8;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,6 +18,8 @@ package org.apache.lucene.analysis.cn.smart;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter; // for javadoc
|
import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter; // for javadoc
|
||||||
|
|
||||||
|
import static java.lang.Character.isSurrogate;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* SmartChineseAnalyzer utility constants and methods
|
* SmartChineseAnalyzer utility constants and methods
|
||||||
* @lucene.experimental
|
* @lucene.experimental
|
||||||
|
@ -152,6 +154,8 @@ public class Utility {
|
||||||
* @see CharType
|
* @see CharType
|
||||||
*/
|
*/
|
||||||
public static int getCharType(char ch) {
|
public static int getCharType(char ch) {
|
||||||
|
if (isSurrogate(ch))
|
||||||
|
return CharType.SURROGATE;
|
||||||
// Most (but not all!) of these are Han Ideographic Characters
|
// Most (but not all!) of these are Han Ideographic Characters
|
||||||
if (ch >= 0x4E00 && ch <= 0x9FA5)
|
if (ch >= 0x4E00 && ch <= 0x9FA5)
|
||||||
return CharType.HANZI;
|
return CharType.HANZI;
|
||||||
|
|
|
@ -21,7 +21,6 @@ import java.util.List;
|
||||||
import org.apache.lucene.analysis.cn.smart.CharType;
|
import org.apache.lucene.analysis.cn.smart.CharType;
|
||||||
import org.apache.lucene.analysis.cn.smart.Utility;
|
import org.apache.lucene.analysis.cn.smart.Utility;
|
||||||
import org.apache.lucene.analysis.cn.smart.WordType;
|
import org.apache.lucene.analysis.cn.smart.WordType;
|
||||||
import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;//javadoc @link
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Finds the optimal segmentation of a sentence into Chinese words
|
* Finds the optimal segmentation of a sentence into Chinese words
|
||||||
|
@ -57,6 +56,15 @@ public class HHMMSegmenter {
|
||||||
case CharType.SPACE_LIKE:
|
case CharType.SPACE_LIKE:
|
||||||
i++;
|
i++;
|
||||||
break;
|
break;
|
||||||
|
case CharType.SURROGATE:
|
||||||
|
int state = Character.codePointAt(sentence, i);
|
||||||
|
int count = Character.charCount(state);
|
||||||
|
charArray = new char[count];
|
||||||
|
sentence.getChars(i, i + count, charArray, 0);
|
||||||
|
token = new SegToken(charArray, i, i + count, WordType.CHINESE_WORD, 0);
|
||||||
|
segGraph.addToken(token);
|
||||||
|
i += count;
|
||||||
|
break;
|
||||||
case CharType.HANZI:
|
case CharType.HANZI:
|
||||||
j = i + 1;
|
j = i + 1;
|
||||||
wordBuf.delete(0, wordBuf.length());
|
wordBuf.delete(0, wordBuf.length());
|
||||||
|
|
|
@ -16,8 +16,11 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.analysis.cn.smart;
|
package org.apache.lucene.analysis.cn.smart;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
|
@ -47,6 +50,36 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
|
||||||
ca.close();
|
ca.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This test verifies that the smartcn HHMMSegmenter correctly handles surrogate pair characters.
|
||||||
|
*/
|
||||||
|
public void testSurrogatePairCharacter() throws Exception {
|
||||||
|
Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */
|
||||||
|
String sentence =
|
||||||
|
Stream.of(
|
||||||
|
"\uD872\uDF3B",
|
||||||
|
"\uD872\uDF4A",
|
||||||
|
"\uD872\uDF73",
|
||||||
|
"\uD872\uDF5B",
|
||||||
|
"\u9FCF",
|
||||||
|
"\uD86D\uDFFC",
|
||||||
|
"\uD872\uDF2D",
|
||||||
|
"\u9FD4")
|
||||||
|
.collect(Collectors.joining());
|
||||||
|
String result[] = {
|
||||||
|
"\uD872\uDF3B",
|
||||||
|
"\uD872\uDF4A",
|
||||||
|
"\uD872\uDF73",
|
||||||
|
"\uD872\uDF5B",
|
||||||
|
"\u9FCF",
|
||||||
|
"\uD86D\uDFFC",
|
||||||
|
"\uD872\uDF2D",
|
||||||
|
"\u9FD4"
|
||||||
|
};
|
||||||
|
assertAnalyzesTo(ca, sentence, result);
|
||||||
|
ca.close();
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This test is the same as the above, except using an ideographic space as a separator.
|
* This test is the same as the above, except using an ideographic space as a separator.
|
||||||
* This tests to ensure the stopwords are working correctly.
|
* This tests to ensure the stopwords are working correctly.
|
||||||
|
|
Loading…
Reference in New Issue