LUCENE-461 - Fix for "StandardTokenizer splitting all of Korean words into separate characters", contributed by Cheolgoo Kang

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@332745 13f79535-47bb-0310-9956-ffa450edef68
Erik Hatcher 2005-11-12 08:33:21 +00:00
parent 1687a79648
commit f00afeee7a
6 changed files with 536 additions and 467 deletions

View File

@@ -234,8 +234,11 @@ Bug fixes
     the original token.
     (Yonik Seeley via Erik Hatcher, LUCENE-437)
-12. Added Unicode range to fix tokenization of Korean.
-    (Otis, http://issues.apache.org/jira/browse/LUCENE-444)
+12. Added Unicode range of Korean characters to StandardTokenizer,
+    grouping contiguous characters into a token rather than one token
+    per character. This change also changes the token type to "<CJ>"
+    for Chinese and Japanese character tokens (previously it was "<CJK>").
+    (Otis and Erik, via Cheolgoo Kang LUCENE-444 and LUCENE-461)
 13. FieldsReader now looks at FieldInfo.storeOffsetWithTermVector and
     FieldInfo.storePositionWithTermVector and creates the Field with
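
To make the behavioral change concrete, here is a small usage sketch (not part of this commit) against the Lucene API of that era, where TokenStream.next() returns a Token; the class and string names outside the diff are assumptions. With the KOREAN range folded into ALPHANUM (see the grammar change below), contiguous Hangul comes out as whole tokens rather than one token per syllable.

    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;

    public class KoreanTokenDemo {
      public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer();
        TokenStream ts = analyzer.tokenStream("body", new StringReader("안녕하세요 한글입니다"));
        for (Token t = ts.next(); t != null; t = ts.next()) {
          // After this change: two tokens, "안녕하세요" and "한글입니다".
          // Before this change: one single-character token per Hangul syllable.
          System.out.println(t.termText() + " / " + t.type());
        }
      }
    }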

View File

@@ -55,8 +55,8 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
     case NUM:
       token = jj_consume_token(NUM);
       break;
-    case CJK:
-      token = jj_consume_token(CJK);
+    case CJ:
+      token = jj_consume_token(CJ);
       break;
     case 0:
       token = jj_consume_token(0);
@@ -166,8 +166,8 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
   public ParseException generateParseException() {
     jj_expentries.removeAllElements();
-    boolean[] la1tokens = new boolean[15];
-    for (int i = 0; i < 15; i++) {
+    boolean[] la1tokens = new boolean[16];
+    for (int i = 0; i < 16; i++) {
       la1tokens[i] = false;
     }
     if (jj_kind >= 0) {
@@ -183,7 +183,7 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
           }
         }
       }
-    for (int i = 0; i < 15; i++) {
+    for (int i = 0; i < 16; i++) {
       if (la1tokens[i]) {
         jj_expentry = new int[1];
         jj_expentry[0] = i;

View File

@@ -1,4 +1,4 @@
-/**
+/**f
  * Copyright 2004 The Apache Software Foundation
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -59,7 +59,7 @@ PARSER_END(StandardTokenizer)
 TOKEN : {                                         // token patterns
   // basic word: a sequence of digits & letters
-  <ALPHANUM: (<LETTER>|<DIGIT>)+ >
+  <ALPHANUM: (<LETTER>|<DIGIT>|<KOREAN>)+ >
   // internal apostrophes: O'Reilly, you're, O'Reilly's
   // use a post-filter to remove possesives
@@ -106,16 +106,20 @@ TOKEN : { // token patterns
       "\u0100"-"\u1fff"
      ]
   >
-| < CJK:                                          // non-alphabets
+| < CJ:                                           // Chinese, Japanese
     [
        "\u3040"-"\u318f",
        "\u3300"-"\u337f",
        "\u3400"-"\u3d2d",
        "\u4e00"-"\u9fff",
-       "\uac00"-"\ud7af",
        "\uf900"-"\ufaff"
     ]
   >
+| < KOREAN:                                       // Korean
+    [
+       "\uac00"-"\ud7af"
+    ]
+  >
 | < #DIGIT:                                       // unicode digits
     [
       "\u0030"-"\u0039",
@@ -157,7 +161,7 @@ org.apache.lucene.analysis.Token next() throws IOException :
     token = <EMAIL> |
     token = <HOST> |
     token = <NUM> |
-    token = <CJK> |
+    token = <CJ> |
     token = <EOF>
    )
     {
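
For readers who do not want to decode the JavaCC character classes, a rough plain-Java equivalent of the ranges above (an illustrative sketch only, not code from this commit):

    // Hangul syllables now get their own class instead of riding along with CJ.
    static boolean isKorean(char c) {
      return c >= '\uAC00' && c <= '\uD7AF';                // Hangul Syllables block
    }

    // Remaining "CJ" ranges: Chinese and Japanese characters, emitted one token per character.
    static boolean isCJ(char c) {
      return (c >= '\u3040' && c <= '\u318F')               // Hiragana, Katakana, Bopomofo, Hangul compat. Jamo
          || (c >= '\u3300' && c <= '\u337F')               // CJK compatibility (squared words)
          || (c >= '\u3400' && c <= '\u3D2D')               // part of CJK Unified Ideographs Extension A
          || (c >= '\u4E00' && c <= '\u9FFF')               // CJK Unified Ideographs
          || (c >= '\uF900' && c <= '\uFAFF');              // CJK Compatibility Ideographs
    }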

View File

@@ -15,9 +15,10 @@ public interface StandardTokenizerConstants {
   int HAS_DIGIT = 9;
   int ALPHA = 10;
   int LETTER = 11;
-  int CJK = 12;
-  int DIGIT = 13;
-  int NOISE = 14;
+  int CJ = 12;
+  int KOREAN = 13;
+  int DIGIT = 14;
+  int NOISE = 15;
   int DEFAULT = 0;
@@ -34,7 +35,8 @@ public interface StandardTokenizerConstants {
     "<HAS_DIGIT>",
     "<ALPHA>",
     "<LETTER>",
-    "<CJK>",
+    "<CJ>",
+    "<KOREAN>",
     "<DIGIT>",
     "<NOISE>",
   };
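
Note that inserting KOREAN shifts the numeric values of DIGIT (13 to 14) and NOISE (14 to 15). A hypothetical caller sketch, with names assumed rather than taken from this commit, showing why kind checks should go through the named constants:

    import org.apache.lucene.analysis.standard.StandardTokenizerConstants;
    import org.apache.lucene.analysis.standard.Token;   // JavaCC-generated parser token

    class TokenKindCheck {
      static String describe(Token t) {
        // Named constants stay correct across this renumbering; literal 13/14 would not.
        if (t.kind == StandardTokenizerConstants.CJ) {
          return "single Chinese/Japanese character";
        }
        // KOREAN exists as a kind, though in practice Korean runs match ALPHANUM first.
        if (t.kind == StandardTokenizerConstants.KOREAN) {
          return "Korean character range";
        }
        return StandardTokenizerConstants.tokenImage[t.kind];
      }
    }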

View File

@@ -94,5 +94,8 @@ public class TestStandardAnalyzer extends TestCase {
     assertAnalyzesTo(a, "C++", new String[]{"c"});
     assertAnalyzesTo(a, "C#", new String[]{"c"});
+
+    // Korean words
+    assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
   }
 }
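
The assertAnalyzesTo helper called above is defined elsewhere in TestStandardAnalyzer and is not shown in this diff; a minimal sketch of what such a helper typically looks like against the Token-returning next() API (details assumed, the real implementation may differ):

    // Belongs inside the TestCase subclass; JUnit 3 assert* methods are inherited.
    public void assertAnalyzesTo(Analyzer a, String input, String[] expected) throws Exception {
      TokenStream ts = a.tokenStream("dummy", new java.io.StringReader(input));
      for (int i = 0; i < expected.length; i++) {
        Token t = ts.next();
        assertNotNull("token " + i + " should exist", t);
        assertEquals(expected[i], t.termText());
      }
      assertNull("no extra tokens expected", ts.next());
      ts.close();
    }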