mirror of https://github.com/apache/lucene.git
#23466 - StandardTokenizer with CJK support (sigram)
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150072 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
13b2aa2136
commit
186a1113cc
|
@ -56,7 +56,7 @@ options {
|
||||||
STATIC = false;
|
STATIC = false;
|
||||||
//IGNORE_CASE = true;
|
//IGNORE_CASE = true;
|
||||||
//BUILD_PARSER = false;
|
//BUILD_PARSER = false;
|
||||||
//UNICODE_INPUT = true;
|
UNICODE_INPUT = true;
|
||||||
USER_CHAR_STREAM = true;
|
USER_CHAR_STREAM = true;
|
||||||
OPTIMIZE_TOKEN_MANAGER = true;
|
OPTIMIZE_TOKEN_MANAGER = true;
|
||||||
//DEBUG_TOKEN_MANAGER = true;
|
//DEBUG_TOKEN_MANAGER = true;
|
||||||
|
@ -125,6 +125,7 @@ TOKEN : { // token patterns
|
||||||
(<LETTER>|<DIGIT>)*
|
(<LETTER>|<DIGIT>)*
|
||||||
>
|
>
|
||||||
|
|
||||||
|
| < SIGRAM: (<CJK>)+ >
|
||||||
| < #ALPHA: (<LETTER>)+>
|
| < #ALPHA: (<LETTER>)+>
|
||||||
| < #LETTER: // unicode letters
|
| < #LETTER: // unicode letters
|
||||||
[
|
[
|
||||||
|
@ -133,7 +134,11 @@ TOKEN : { // token patterns
|
||||||
"\u00c0"-"\u00d6",
|
"\u00c0"-"\u00d6",
|
||||||
"\u00d8"-"\u00f6",
|
"\u00d8"-"\u00f6",
|
||||||
"\u00f8"-"\u00ff",
|
"\u00f8"-"\u00ff",
|
||||||
"\u0100"-"\u1fff",
|
"\u0100"-"\u1fff"
|
||||||
|
]
|
||||||
|
>
|
||||||
|
| < #CJK: // non-alphabets
|
||||||
|
[
|
||||||
"\u3040"-"\u318f",
|
"\u3040"-"\u318f",
|
||||||
"\u3300"-"\u337f",
|
"\u3300"-"\u337f",
|
||||||
"\u3400"-"\u3d2d",
|
"\u3400"-"\u3d2d",
|
||||||
|
@ -182,6 +187,7 @@ org.apache.lucene.analysis.Token next() throws IOException :
|
||||||
token = <EMAIL> |
|
token = <EMAIL> |
|
||||||
token = <HOST> |
|
token = <HOST> |
|
||||||
token = <NUM> |
|
token = <NUM> |
|
||||||
|
token = <SIGRAM> |
|
||||||
token = <EOF>
|
token = <EOF>
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in New Issue