#23466 - StandardTokenizer with CJK support (sigram)

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150072 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Erik Hatcher 2003-09-30 16:31:49 +00:00
parent 13b2aa2136
commit 186a1113cc
1 changed file with 8 additions and 2 deletions

View File

@ -56,7 +56,7 @@ options {
STATIC = false; STATIC = false;
//IGNORE_CASE = true; //IGNORE_CASE = true;
//BUILD_PARSER = false; //BUILD_PARSER = false;
//UNICODE_INPUT = true; UNICODE_INPUT = true;
USER_CHAR_STREAM = true; USER_CHAR_STREAM = true;
OPTIMIZE_TOKEN_MANAGER = true; OPTIMIZE_TOKEN_MANAGER = true;
//DEBUG_TOKEN_MANAGER = true; //DEBUG_TOKEN_MANAGER = true;
@ -125,6 +125,7 @@ TOKEN : { // token patterns
(<LETTER>|<DIGIT>)* (<LETTER>|<DIGIT>)*
> >
| < SIGRAM: (<CJK>)+ >
| < #ALPHA: (<LETTER>)+> | < #ALPHA: (<LETTER>)+>
| < #LETTER: // unicode letters | < #LETTER: // unicode letters
[ [
@ -133,7 +134,11 @@ TOKEN : { // token patterns
"\u00c0"-"\u00d6", "\u00c0"-"\u00d6",
"\u00d8"-"\u00f6", "\u00d8"-"\u00f6",
"\u00f8"-"\u00ff", "\u00f8"-"\u00ff",
"\u0100"-"\u1fff", "\u0100"-"\u1fff"
]
>
| < #CJK: // non-alphabets
[
"\u3040"-"\u318f", "\u3040"-"\u318f",
"\u3300"-"\u337f", "\u3300"-"\u337f",
"\u3400"-"\u3d2d", "\u3400"-"\u3d2d",
@ -182,6 +187,7 @@ org.apache.lucene.analysis.Token next() throws IOException :
token = <EMAIL> | token = <EMAIL> |
token = <HOST> | token = <HOST> |
token = <NUM> | token = <NUM> |
token = <SIGRAM> |
token = <EOF> token = <EOF>
) )
{ {