mirror of https://github.com/apache/lucene.git
LUCENE-8966: The Korean analyzer now splits tokens on boundaries between digits and alphabetic characters.
This commit is contained in:
parent
89c54ec0d1
commit
c4815f04c0
|
@ -75,6 +75,8 @@ Improvements
|
||||||
|
|
||||||
* LUCENE-8976: Use exact distance between point and bounding rectangle in FloatPointNearestNeighbor. (Ignacio Vera)
|
* LUCENE-8976: Use exact distance between point and bounding rectangle in FloatPointNearestNeighbor. (Ignacio Vera)
|
||||||
|
|
||||||
|
* LUCENE-8966: The Korean analyzer now splits tokens on boundaries between digits and alphabetic characters. (Jim Ferenczi)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
* LUCENE-8922: DisjunctionMaxQuery more efficiently leverages impacts to skip
|
* LUCENE-8922: DisjunctionMaxQuery more efficiently leverages impacts to skip
|
||||||
|
|
|
@ -760,6 +760,7 @@ public final class KoreanTokenizer extends Tokenizer {
|
||||||
unknownWordLength = 1;
|
unknownWordLength = 1;
|
||||||
UnicodeScript scriptCode = UnicodeScript.of((int) firstCharacter);
|
UnicodeScript scriptCode = UnicodeScript.of((int) firstCharacter);
|
||||||
final boolean isPunct = isPunctuation(firstCharacter);
|
final boolean isPunct = isPunctuation(firstCharacter);
|
||||||
|
final boolean isDigit = Character.isDigit(firstCharacter);
|
||||||
for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) {
|
for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) {
|
||||||
int next = buffer.get(posAhead);
|
int next = buffer.get(posAhead);
|
||||||
if (next == -1) {
|
if (next == -1) {
|
||||||
|
@ -774,7 +775,10 @@ public final class KoreanTokenizer extends Tokenizer {
|
||||||
|| chType == Character.NON_SPACING_MARK;
|
|| chType == Character.NON_SPACING_MARK;
|
||||||
|
|
||||||
if (sameScript
|
if (sameScript
|
||||||
|
// split on punctuation
|
||||||
&& isPunctuation(ch, chType) == isPunct
|
&& isPunctuation(ch, chType) == isPunct
|
||||||
|
// split on digit
|
||||||
|
&& Character.isDigit(ch) == isDigit
|
||||||
&& characterDefinition.isGroup(ch)) {
|
&& characterDefinition.isGroup(ch)) {
|
||||||
unknownWordLength++;
|
unknownWordLength++;
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -108,6 +108,22 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Checks that a run of digits is emitted as its own token when it is directly
// followed by Hangul text, i.e. the boundary between digits and letters splits the token.
// NOTE(review): the three int arrays look like start offsets, end offsets and
// position increments (they line up with the token positions in the input) —
// confirm against the assertAnalyzesTo signature.
public void testSeparateNumber() throws IOException {
|
||||||
|
// "44" (chars 0-2) and "사이즈" (chars 2-5) are expected as two separate tokens.
assertAnalyzesTo(analyzer, "44사이즈",
|
||||||
|
new String[]{"44", "사이즈"},
|
||||||
|
new int[]{0, 2},
|
||||||
|
new int[]{2, 5},
|
||||||
|
new int[]{1, 1}
|
||||||
|
);
|
||||||
|
|
||||||
|
// Each digit around the '.' is expected as its own token and the '.' itself does not
// appear in the expected output (presumably discarded as punctuation by this analyzer — verify).
assertAnalyzesTo(analyzer, "9.9사이즈",
|
||||||
|
new String[]{"9", "9", "사이즈"},
|
||||||
|
new int[]{0, 2, 3},
|
||||||
|
new int[]{1, 3, 6},
|
||||||
|
new int[]{1, 1, 1}
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
public void testSpaces() throws IOException {
|
public void testSpaces() throws IOException {
|
||||||
assertAnalyzesTo(analyzer, "화학 이외의 것",
|
assertAnalyzesTo(analyzer, "화학 이외의 것",
|
||||||
new String[]{"화학", "이외", "의", "것"},
|
new String[]{"화학", "이외", "의", "것"},
|
||||||
|
|
Loading…
Reference in New Issue