mirror of https://github.com/apache/lucene.git
Fix StandardTokenizer's handling of CJK characters.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150159 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
fb13af2325
commit
8016aa3ea5
|
@ -21,6 +21,11 @@ $Id$
|
|||
than the final token position. Position is used in phrase
|
||||
searching (see PhraseQuery and Token.setPositionIncrement()).
|
||||
|
||||
5. Fix StandardTokenizer's handling of CJK characters (Chinese,
|
||||
Japanese and Korean ideograms). Previously contiguous sequences
|
||||
were combined in a single token, which is not very useful. Now
|
||||
each ideogram generates a separate token, which is more useful.
|
||||
|
||||
|
||||
1.3 RC3
|
||||
|
||||
|
|
|
@ -47,8 +47,8 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
|
|||
case NUM:
|
||||
token = jj_consume_token(NUM);
|
||||
break;
|
||||
case SIGRAM:
|
||||
token = jj_consume_token(SIGRAM);
|
||||
case CJK:
|
||||
token = jj_consume_token(CJK);
|
||||
break;
|
||||
case 0:
|
||||
token = jj_consume_token(0);
|
||||
|
@ -79,7 +79,7 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
|
|||
jj_la1_0();
|
||||
}
|
||||
private static void jj_la1_0() {
|
||||
jj_la1_0 = new int[] {0x4ff,};
|
||||
jj_la1_0 = new int[] {0x10ff,};
|
||||
}
|
||||
|
||||
public StandardTokenizer(CharStream stream) {
|
||||
|
@ -158,8 +158,8 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
|
|||
|
||||
public ParseException generateParseException() {
|
||||
jj_expentries.removeAllElements();
|
||||
boolean[] la1tokens = new boolean[16];
|
||||
for (int i = 0; i < 16; i++) {
|
||||
boolean[] la1tokens = new boolean[15];
|
||||
for (int i = 0; i < 15; i++) {
|
||||
la1tokens[i] = false;
|
||||
}
|
||||
if (jj_kind >= 0) {
|
||||
|
@ -175,7 +175,7 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
|
|||
}
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < 16; i++) {
|
||||
for (int i = 0; i < 15; i++) {
|
||||
if (la1tokens[i]) {
|
||||
jj_expentry = new int[1];
|
||||
jj_expentry[0] = i;
|
||||
|
|
|
@ -125,7 +125,6 @@ TOKEN : { // token patterns
|
|||
(<LETTER>|<DIGIT>)*
|
||||
>
|
||||
|
||||
| < SIGRAM: (<CJK>)+ >
|
||||
| < #ALPHA: (<LETTER>)+>
|
||||
| < #LETTER: // unicode letters
|
||||
[
|
||||
|
@ -137,7 +136,7 @@ TOKEN : { // token patterns
|
|||
"\u0100"-"\u1fff"
|
||||
]
|
||||
>
|
||||
| < #CJK: // non-alphabets
|
||||
| < CJK: // non-alphabets
|
||||
[
|
||||
"\u3040"-"\u318f",
|
||||
"\u3300"-"\u337f",
|
||||
|
@ -187,7 +186,7 @@ org.apache.lucene.analysis.Token next() throws IOException :
|
|||
token = <EMAIL> |
|
||||
token = <HOST> |
|
||||
token = <NUM> |
|
||||
token = <SIGRAM> |
|
||||
token = <CJK> |
|
||||
token = <EOF>
|
||||
)
|
||||
{
|
||||
|
|
|
@ -13,12 +13,11 @@ public interface StandardTokenizerConstants {
|
|||
int NUM = 7;
|
||||
int P = 8;
|
||||
int HAS_DIGIT = 9;
|
||||
int SIGRAM = 10;
|
||||
int ALPHA = 11;
|
||||
int LETTER = 12;
|
||||
int CJK = 13;
|
||||
int DIGIT = 14;
|
||||
int NOISE = 15;
|
||||
int ALPHA = 10;
|
||||
int LETTER = 11;
|
||||
int CJK = 12;
|
||||
int DIGIT = 13;
|
||||
int NOISE = 14;
|
||||
|
||||
int DEFAULT = 0;
|
||||
|
||||
|
@ -33,7 +32,6 @@ public interface StandardTokenizerConstants {
|
|||
"<NUM>",
|
||||
"<P>",
|
||||
"<HAS_DIGIT>",
|
||||
"<SIGRAM>",
|
||||
"<ALPHA>",
|
||||
"<LETTER>",
|
||||
"<CJK>",
|
||||
|
|
|
@ -8,7 +8,7 @@ public class StandardTokenizerTokenManager implements StandardTokenizerConstants
|
|||
public void setDebugStream(java.io.PrintStream ds) { debugStream = ds; }
|
||||
private final int jjMoveStringLiteralDfa0_0()
|
||||
{
|
||||
return jjMoveNfa_0(1, 0);
|
||||
return jjMoveNfa_0(0, 0);
|
||||
}
|
||||
private final void jjCheckNAdd(int state)
|
||||
{
|
||||
|
@ -101,7 +101,7 @@ private final int jjMoveNfa_0(int startState, int curPos)
|
|||
{
|
||||
switch(jjstateSet[--i])
|
||||
{
|
||||
case 1:
|
||||
case 0:
|
||||
if ((0x3ff000000000000L & l) != 0L)
|
||||
{
|
||||
if (kind > 1)
|
||||
|
@ -111,6 +111,10 @@ private final int jjMoveNfa_0(int startState, int curPos)
|
|||
if ((0x3ff000000000000L & l) != 0L)
|
||||
jjCheckNAddStates(18, 23);
|
||||
break;
|
||||
case 1:
|
||||
if ((0x3ff000000000000L & l) != 0L)
|
||||
jjCheckNAddStates(18, 23);
|
||||
break;
|
||||
case 2:
|
||||
case 39:
|
||||
if ((0x3ff000000000000L & l) != 0L)
|
||||
|
@ -380,7 +384,7 @@ private final int jjMoveNfa_0(int startState, int curPos)
|
|||
{
|
||||
switch(jjstateSet[--i])
|
||||
{
|
||||
case 1:
|
||||
case 0:
|
||||
if ((0x7fffffe07fffffeL & l) != 0L)
|
||||
jjCheckNAddStates(30, 35);
|
||||
if ((0x7fffffe07fffffeL & l) != 0L)
|
||||
|
@ -669,12 +673,11 @@ private final int jjMoveNfa_0(int startState, int curPos)
|
|||
{
|
||||
switch(jjstateSet[--i])
|
||||
{
|
||||
case 1:
|
||||
case 0:
|
||||
if (jjCanMove_0(hiByte, i1, i2, l1, l2))
|
||||
{
|
||||
if (kind > 10)
|
||||
kind = 10;
|
||||
jjCheckNAdd(0);
|
||||
if (kind > 12)
|
||||
kind = 12;
|
||||
}
|
||||
if (jjCanMove_1(hiByte, i1, i2, l1, l2))
|
||||
jjCheckNAddStates(18, 23);
|
||||
|
@ -687,12 +690,9 @@ private final int jjMoveNfa_0(int startState, int curPos)
|
|||
if (jjCanMove_2(hiByte, i1, i2, l1, l2))
|
||||
jjCheckNAddStates(30, 35);
|
||||
break;
|
||||
case 0:
|
||||
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
|
||||
break;
|
||||
if (kind > 10)
|
||||
kind = 10;
|
||||
jjCheckNAdd(0);
|
||||
case 1:
|
||||
if (jjCanMove_1(hiByte, i1, i2, l1, l2))
|
||||
jjCheckNAddStates(18, 23);
|
||||
break;
|
||||
case 2:
|
||||
if (jjCanMove_2(hiByte, i1, i2, l1, l2))
|
||||
|
@ -1021,15 +1021,15 @@ private static final boolean jjCanMove_2(int hiByte, int i1, int i2, long l1, lo
|
|||
}
|
||||
public static final String[] jjstrLiteralImages = {
|
||||
"", null, null, null, null, null, null, null, null, null, null, null, null,
|
||||
null, null, null, };
|
||||
null, null, };
|
||||
public static final String[] lexStateNames = {
|
||||
"DEFAULT",
|
||||
};
|
||||
static final long[] jjtoToken = {
|
||||
0x4ffL,
|
||||
0x10ffL,
|
||||
};
|
||||
static final long[] jjtoSkip = {
|
||||
0x8000L,
|
||||
0x4000L,
|
||||
};
|
||||
protected CharStream input_stream;
|
||||
private final int[] jjrounds = new int[73];
|
||||
|
@ -1115,9 +1115,9 @@ public Token getNextToken()
|
|||
jjmatchedKind = 0x7fffffff;
|
||||
jjmatchedPos = 0;
|
||||
curPos = jjMoveStringLiteralDfa0_0();
|
||||
if (jjmatchedPos == 0 && jjmatchedKind > 15)
|
||||
if (jjmatchedPos == 0 && jjmatchedKind > 14)
|
||||
{
|
||||
jjmatchedKind = 15;
|
||||
jjmatchedKind = 14;
|
||||
}
|
||||
if (jjmatchedKind != 0x7fffffff)
|
||||
{
|
||||
|
|
Loading…
Reference in New Issue