mirror of https://github.com/apache/lucene.git
LUCENE-461 - Fix for "StandardTokenizer splitting all of Korean words into separate characters", contributed by Cheolgoo Kang
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@332745 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 1687a79648
commit f00afeee7a
@@ -234,8 +234,11 @@ Bug fixes
     the original token.
     (Yonik Seeley via Erik Hatcher, LUCENE-437)

-12. Added Unicode range to fix tokenization of Korean.
-    (Otis, http://issues.apache.org/jira/browse/LUCENE-444)
+12. Added Unicode range of Korean characters to StandardTokenizer,
+    grouping contiguous characters into a token rather than one token
+    per character. This change also changes the token type to "<CJ>"
+    for Chinese and Japanese character tokens (previously it was "<CJK>").
+    (Otis and Erik, via Cheolgoo Kang LUCENE-444 and LUCENE-461)

 13. FieldsReader now looks at FieldInfo.storeOffsetWithTermVector and
     FieldInfo.storePositionWithTermVector and creates the Field with
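For illustration only (not part of this commit), a minimal sketch of how the change shows up through the analyzer API of this era; the class name KoreanTokenizationDemo is hypothetical. With the <KOREAN> range folded into <ALPHANUM>, contiguous Hangul syllables should come back as whole words rather than one token per character.

import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class KoreanTokenizationDemo {
    public static void main(String[] args) throws Exception {
        // Tokenize a Korean sentence with StandardAnalyzer and print each token with its type.
        TokenStream ts = new StandardAnalyzer()
                .tokenStream("field", new StringReader("안녕하세요 한글입니다"));
        for (Token t = ts.next(); t != null; t = ts.next()) {
            System.out.println(t.termText() + " [" + t.type() + "]");
        }
        // Expected with this patch: two tokens (안녕하세요, 한글입니다),
        // instead of one token per Hangul syllable as before.
        ts.close();
    }
}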
@@ -55,8 +55,8 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
     case NUM:
       token = jj_consume_token(NUM);
       break;
-    case CJK:
-      token = jj_consume_token(CJK);
+    case CJ:
+      token = jj_consume_token(CJ);
       break;
     case 0:
       token = jj_consume_token(0);
@@ -166,8 +166,8 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl

   public ParseException generateParseException() {
     jj_expentries.removeAllElements();
-    boolean[] la1tokens = new boolean[15];
-    for (int i = 0; i < 15; i++) {
+    boolean[] la1tokens = new boolean[16];
+    for (int i = 0; i < 16; i++) {
       la1tokens[i] = false;
     }
     if (jj_kind >= 0) {
@@ -183,7 +183,7 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
         }
       }
     }
-    for (int i = 0; i < 15; i++) {
+    for (int i = 0; i < 16; i++) {
       if (la1tokens[i]) {
         jj_expentry = new int[1];
         jj_expentry[0] = i;
@@ -1,4 +1,4 @@
-/**
+/**f
  * Copyright 2004 The Apache Software Foundation
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -59,7 +59,7 @@ PARSER_END(StandardTokenizer)
 TOKEN : {   // token patterns

   // basic word: a sequence of digits & letters
-  <ALPHANUM: (<LETTER>|<DIGIT>)+ >
+  <ALPHANUM: (<LETTER>|<DIGIT>|<KOREAN>)+ >

   // internal apostrophes: O'Reilly, you're, O'Reilly's
   // use a post-filter to remove possesives
@@ -106,16 +106,20 @@ TOKEN : { // token patterns
    "\u0100"-"\u1fff"
    ]
 >
-| < CJK:     // non-alphabets
+| < CJ:      // Chinese, Japanese
    [
    "\u3040"-"\u318f",
    "\u3300"-"\u337f",
    "\u3400"-"\u3d2d",
    "\u4e00"-"\u9fff",
-   "\uac00"-"\ud7af",
    "\uf900"-"\ufaff"
    ]
 >
+| < KOREAN:  // Korean
+   [
+   "\uac00"-"\ud7af"
+   ]
+>
 | < #DIGIT:  // unicode digits
    [
    "\u0030"-"\u0039",
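The new <KOREAN> character class covers the Hangul Syllables block (U+AC00 through U+D7AF), which was previously buried inside <CJK>. Below is a minimal sketch of the range test the grammar expresses; the class and method names are hypothetical, not part of the patch.

public final class HangulRange {
    // Bounds of the Hangul Syllables block matched by the <KOREAN> character class.
    private static final char FIRST = '\uAC00';
    private static final char LAST = '\uD7AF';

    // True if c falls inside the range the <KOREAN> class matches.
    public static boolean isKoreanSyllable(char c) {
        return c >= FIRST && c <= LAST;
    }

    public static void main(String[] args) {
        System.out.println(isKoreanSyllable('한'));  // true: Hangul syllable
        System.out.println(isKoreanSyllable('漢'));  // false: CJK ideograph, still matched by <CJ>
    }
}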
@@ -157,7 +161,7 @@ org.apache.lucene.analysis.Token next() throws IOException :
     token = <EMAIL> |
     token = <HOST> |
     token = <NUM> |
-    token = <CJK> |
+    token = <CJ> |
     token = <EOF>
    )
    {
@@ -15,9 +15,10 @@ public interface StandardTokenizerConstants {
   int HAS_DIGIT = 9;
   int ALPHA = 10;
   int LETTER = 11;
-  int CJK = 12;
-  int DIGIT = 13;
-  int NOISE = 14;
+  int CJ = 12;
+  int KOREAN = 13;
+  int DIGIT = 14;
+  int NOISE = 15;

   int DEFAULT = 0;
@@ -34,7 +35,8 @@ public interface StandardTokenizerConstants {
     "<HAS_DIGIT>",
     "<ALPHA>",
     "<LETTER>",
-    "<CJK>",
+    "<CJ>",
+    "<KOREAN>",
     "<DIGIT>",
     "<NOISE>",
   };
File diff suppressed because it is too large
@@ -94,5 +94,8 @@ public class TestStandardAnalyzer extends TestCase {
     assertAnalyzesTo(a, "C++", new String[]{"c"});
     assertAnalyzesTo(a, "C#", new String[]{"c"});

+    // Korean words
+    assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
+
   }
 }
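The test relies on an assertAnalyzesTo helper that compares the analyzer's output tokens against an expected array. Below is a minimal sketch of what such a helper can look like with the TokenStream API of this era; it is an assumption about its shape, not the actual code in TestStandardAnalyzer, and the class and field names are hypothetical.

import java.io.StringReader;
import junit.framework.Assert;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public final class AnalyzerAssert {
    // Assert that the analyzer produces exactly the expected token texts, in order.
    public static void assertAnalyzesTo(Analyzer a, String input, String[] expected) throws Exception {
        TokenStream ts = a.tokenStream("dummy", new StringReader(input));
        for (int i = 0; i < expected.length; i++) {
            Token t = ts.next();
            Assert.assertNotNull("token " + i + " is missing", t);
            Assert.assertEquals(expected[i], t.termText());
        }
        Assert.assertNull("unexpected trailing token", ts.next());
        ts.close();
    }
}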