LUCENE-8966: The Korean analyzer now splits tokens on boundaries between digits and alphabetic characters.

This commit is contained in:
jimczi 2019-09-13 09:57:21 +02:00
parent 89c54ec0d1
commit c4815f04c0
3 changed files with 22 additions and 0 deletions

View File

@@ -75,6 +75,8 @@ Improvements
* LUCENE-8976: Use exact distance between point and bounding rectangle in FloatPointNearestNeighbor. (Ignacio Vera)
* LUCENE-8966: The Korean analyzer now splits tokens on boundaries between digits and alphabetic characters. (Jim Ferenczi)
Optimizations
* LUCENE-8922: DisjunctionMaxQuery more efficiently leverages impacts to skip

View File

@@ -760,6 +760,7 @@ public final class KoreanTokenizer extends Tokenizer {
unknownWordLength = 1;
UnicodeScript scriptCode = UnicodeScript.of((int) firstCharacter);
final boolean isPunct = isPunctuation(firstCharacter);
final boolean isDigit = Character.isDigit(firstCharacter);
for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) {
int next = buffer.get(posAhead);
if (next == -1) {
@@ -774,7 +775,10 @@ public final class KoreanTokenizer extends Tokenizer {
|| chType == Character.NON_SPACING_MARK;
if (sameScript
// split on punctuation
&& isPunctuation(ch, chType) == isPunct
// split on digit
&& Character.isDigit(ch) == isDigit
&& characterDefinition.isGroup(ch)) {
unknownWordLength++;
} else {

View File

@@ -108,6 +108,22 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
};
}
/**
 * Checks that the analyzer emits separate tokens where a run of digits meets Hangul text.
 *
 * <p>The arrays passed to each {@code assertAnalyzesTo} call are, in order: expected terms,
 * expected start offsets, expected end offsets and expected position increments.
 *
 * @throws IOException if analysis of the input fails
 */
public void testSeparateNumber() throws IOException {
  // Digits immediately followed by Hangul: "44" covers [0,2) and the Hangul word covers [2,5).
  assertAnalyzesTo(analyzer, "44사이즈",
      new String[]{"44", "사이즈"},
      new int[]{0, 2},
      new int[]{2, 5},
      new int[]{1, 1}
  );
  // Two single digits separated by '.', then Hangul. The expected offsets leave index 1 (the
  // '.') uncovered by any token, and the second "9" [2,3) is split from the Hangul word [3,6).
  // The input therefore has to be six characters long for the end offset 6 to be valid.
  assertAnalyzesTo(analyzer, "9.9사이즈",
      new String[]{"9", "9", "사이즈"},
      new int[]{0, 2, 3},
      new int[]{1, 3, 6},
      new int[]{1, 1, 1}
  );
}
public void testSpaces() throws IOException {
assertAnalyzesTo(analyzer, "화학 이외의 것",
new String[]{"화학", "이외", "", ""},