LUCENE-461 - Fix for "StandardTokenizer splitting all of Korean words into separate characters", contributed by Cheolgoo Kang

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@332745 13f79535-47bb-0310-9956-ffa450edef68
Erik Hatcher 2005-11-12 08:33:21 +00:00
parent 1687a79648
commit f00afeee7a
6 changed files with 536 additions and 467 deletions

CHANGES.txt

@@ -234,8 +234,11 @@ Bug fixes
     the original token.
     (Yonik Seeley via Erik Hatcher, LUCENE-437)
-12. Added Unicode range to fix tokenization of Korean.
-    (Otis, http://issues.apache.org/jira/browse/LUCENE-444)
+12. Added Unicode range of Korean characters to StandardTokenizer,
+    grouping contiguous characters into a token rather than one token
+    per character. This change also changes the token type to "<CJ>"
+    for Chinese and Japanese character tokens (previously it was "<CJK>").
+    (Otis and Erik, via Cheolgoo Kang LUCENE-444 and LUCENE-461)
 13. FieldsReader now looks at FieldInfo.storeOffsetWithTermVector and
     FieldInfo.storePositionWithTermVector and creates the Field with
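
In practice the new behavior looks like the following minimal sketch (not part of the commit; the class name KoreanTokenDemo is hypothetical). It assumes the pre-2.x TokenStream.next()/Token API of this era and prints each token with its type; with this fix the two Hangul words come out as two whole tokens instead of one single-character token apiece:

    import java.io.StringReader;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;

    public class KoreanTokenDemo {
      public static void main(String[] args) throws Exception {
        TokenStream ts = new StandardAnalyzer()
            .tokenStream("field", new StringReader("안녕하세요 한글입니다"));
        for (Token t = ts.next(); t != null; t = ts.next()) {
          // After this fix: "안녕하세요 / <ALPHANUM>" and "한글입니다 / <ALPHANUM>".
          // Before it: one token per Hangul character, typed <CJK>.
          System.out.println(t.termText() + " / " + t.type());
        }
      }
    }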

src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java

@@ -55,8 +55,8 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
       case NUM:
         token = jj_consume_token(NUM);
         break;
-      case CJK:
-        token = jj_consume_token(CJK);
+      case CJ:
+        token = jj_consume_token(CJ);
         break;
       case 0:
         token = jj_consume_token(0);
@@ -166,8 +166,8 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
   public ParseException generateParseException() {
     jj_expentries.removeAllElements();
-    boolean[] la1tokens = new boolean[15];
-    for (int i = 0; i < 15; i++) {
+    boolean[] la1tokens = new boolean[16];
+    for (int i = 0; i < 16; i++) {
       la1tokens[i] = false;
     }
     if (jj_kind >= 0) {
@@ -183,7 +183,7 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
         }
       }
     }
-    for (int i = 0; i < 15; i++) {
+    for (int i = 0; i < 16; i++) {
       if (la1tokens[i]) {
         jj_expentry = new int[1];
         jj_expentry[0] = i;

src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj

@@ -1,4 +1,4 @@
-/**
+/**f
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -59,7 +59,7 @@ PARSER_END(StandardTokenizer)
 TOKEN : { // token patterns
   // basic word: a sequence of digits & letters
-  <ALPHANUM: (<LETTER>|<DIGIT>)+ >
+  <ALPHANUM: (<LETTER>|<DIGIT>|<KOREAN>)+ >
   // internal apostrophes: O'Reilly, you're, O'Reilly's
   // use a post-filter to remove possesives
@@ -106,16 +106,20 @@ TOKEN : { // token patterns
     "\u0100"-"\u1fff"
     ]
   >
-| < CJK: // non-alphabets
+| < CJ: // Chinese, Japanese
     [
      "\u3040"-"\u318f",
      "\u3300"-"\u337f",
      "\u3400"-"\u3d2d",
      "\u4e00"-"\u9fff",
-     "\uac00"-"\ud7af",
      "\uf900"-"\ufaff"
     ]
   >
+| < KOREAN: // Korean
+    [
+     "\uac00"-"\ud7af"
+    ]
+  >
 | < #DIGIT: // unicode digits
     [
      "\u0030"-"\u0039",
@@ -157,7 +161,7 @@ org.apache.lucene.analysis.Token next() throws IOException :
     token = <EMAIL> |
     token = <HOST> |
     token = <NUM> |
-    token = <CJK> |
+    token = <CJ> |
     token = <EOF>
   )
   {
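
The shape of the grammar is what produces the grouping: <KOREAN> is now an alternative inside the one-or-more loop of <ALPHANUM>, so a run of Hangul syllables is consumed as one token, while <CJ> remains a standalone single-character class, so Chinese and Japanese text still yields one token per character. A sketch of the same character classification in plain Java (hypothetical helpers, not Lucene code):

    // Hypothetical helpers mirroring the grammar's character classes above.
    final class CharClasses {
      // The new <KOREAN> class: Hangul syllables.
      static boolean isKorean(char c) {
        return c >= '\uAC00' && c <= '\uD7AF';
      }
      // The <CJ> class: the old <CJK> ranges minus the Hangul block.
      static boolean isCJ(char c) {
        return (c >= '\u3040' && c <= '\u318F')
            || (c >= '\u3300' && c <= '\u337F')
            || (c >= '\u3400' && c <= '\u3D2D')
            || (c >= '\u4E00' && c <= '\u9FFF')  // CJK unified ideographs
            || (c >= '\uF900' && c <= '\uFAFF'); // CJK compatibility ideographs
      }
    }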

src/java/org/apache/lucene/analysis/standard/StandardTokenizerConstants.java

@@ -15,9 +15,10 @@ public interface StandardTokenizerConstants {
   int HAS_DIGIT = 9;
   int ALPHA = 10;
   int LETTER = 11;
-  int CJK = 12;
-  int DIGIT = 13;
-  int NOISE = 14;
+  int CJ = 12;
+  int KOREAN = 13;
+  int DIGIT = 14;
+  int NOISE = 15;
   int DEFAULT = 0;
@@ -34,7 +35,8 @@ public interface StandardTokenizerConstants {
     "<HAS_DIGIT>",
     "<ALPHA>",
     "<LETTER>",
-    "<CJK>",
+    "<CJ>",
+    "<KOREAN>",
     "<DIGIT>",
     "<NOISE>",
   };
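
The renumbering matters because the generated tokenizer reports a token's type by indexing tokenImage with the token kind, which is why the CJK-to-CJ rename also changes the strings Token.type() returns; the extra <KOREAN> kind is likewise why the la1tokens array in generateParseException grew from 15 to 16 slots. A small illustrative sketch (TokenKindDemo is hypothetical, not part of the commit):

    import org.apache.lucene.analysis.standard.StandardTokenizerConstants;

    public class TokenKindDemo implements StandardTokenizerConstants {
      public static void main(String[] args) {
        System.out.println(tokenImage[CJ]);     // prints <CJ> (kind 12)
        System.out.println(tokenImage[KOREAN]); // prints <KOREAN> (kind 13)
      }
    }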

src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java

@@ -94,5 +94,8 @@ public class TestStandardAnalyzer extends TestCase {
     assertAnalyzesTo(a, "C++", new String[]{"c"});
     assertAnalyzesTo(a, "C#", new String[]{"c"});
+
+    // Korean words
+    assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
   }
 }
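
If the complementary <CJ> behavior also needed pinning down, a hypothetical extra assertion in the same style (not part of this commit) could sit next to the Korean check above:

    // Chinese/Japanese characters still tokenize one per character:
    assertAnalyzesTo(a, "中华人民共和国",
        new String[]{"中", "华", "人", "民", "共", "和", "国"});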