mirror of https://github.com/apache/lucene.git
Fix StandardTokenizer's handling of CJK characters.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150159 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
fb13af2325
commit
8016aa3ea5
|
@ -21,6 +21,11 @@ $Id$
|
||||||
than the final token position. Position is used in phrase
|
than the final token position. Position is used in phrase
|
||||||
searching (see PhraseQuery and Token.setPositionIncrement()).
|
searching (see PhraseQuery and Token.setPositionIncrement()).
|
||||||
|
|
||||||
|
5. Fix StandardTokenizer's handling of CJK characters (Chinese,
|
||||||
|
Japanese and Korean ideograms). Previously, contiguous sequences
|
||||||
|
were combined into a single token, which is not very useful. Now
|
||||||
|
each ideogram generates a separate token, which is more useful.
|
||||||
|
|
||||||
|
|
||||||
1.3 RC3
|
1.3 RC3
|
||||||
|
|
||||||
|
|
|
@ -47,8 +47,8 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
|
||||||
case NUM:
|
case NUM:
|
||||||
token = jj_consume_token(NUM);
|
token = jj_consume_token(NUM);
|
||||||
break;
|
break;
|
||||||
case SIGRAM:
|
case CJK:
|
||||||
token = jj_consume_token(SIGRAM);
|
token = jj_consume_token(CJK);
|
||||||
break;
|
break;
|
||||||
case 0:
|
case 0:
|
||||||
token = jj_consume_token(0);
|
token = jj_consume_token(0);
|
||||||
|
@ -79,7 +79,7 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
|
||||||
jj_la1_0();
|
jj_la1_0();
|
||||||
}
|
}
|
||||||
private static void jj_la1_0() {
|
private static void jj_la1_0() {
|
||||||
jj_la1_0 = new int[] {0x4ff,};
|
jj_la1_0 = new int[] {0x10ff,};
|
||||||
}
|
}
|
||||||
|
|
||||||
public StandardTokenizer(CharStream stream) {
|
public StandardTokenizer(CharStream stream) {
|
||||||
|
@ -158,8 +158,8 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
|
||||||
|
|
||||||
public ParseException generateParseException() {
|
public ParseException generateParseException() {
|
||||||
jj_expentries.removeAllElements();
|
jj_expentries.removeAllElements();
|
||||||
boolean[] la1tokens = new boolean[16];
|
boolean[] la1tokens = new boolean[15];
|
||||||
for (int i = 0; i < 16; i++) {
|
for (int i = 0; i < 15; i++) {
|
||||||
la1tokens[i] = false;
|
la1tokens[i] = false;
|
||||||
}
|
}
|
||||||
if (jj_kind >= 0) {
|
if (jj_kind >= 0) {
|
||||||
|
@ -175,7 +175,7 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (int i = 0; i < 16; i++) {
|
for (int i = 0; i < 15; i++) {
|
||||||
if (la1tokens[i]) {
|
if (la1tokens[i]) {
|
||||||
jj_expentry = new int[1];
|
jj_expentry = new int[1];
|
||||||
jj_expentry[0] = i;
|
jj_expentry[0] = i;
|
||||||
|
|
|
@ -125,7 +125,6 @@ TOKEN : { // token patterns
|
||||||
(<LETTER>|<DIGIT>)*
|
(<LETTER>|<DIGIT>)*
|
||||||
>
|
>
|
||||||
|
|
||||||
| < SIGRAM: (<CJK>)+ >
|
|
||||||
| < #ALPHA: (<LETTER>)+>
|
| < #ALPHA: (<LETTER>)+>
|
||||||
| < #LETTER: // unicode letters
|
| < #LETTER: // unicode letters
|
||||||
[
|
[
|
||||||
|
@ -137,7 +136,7 @@ TOKEN : { // token patterns
|
||||||
"\u0100"-"\u1fff"
|
"\u0100"-"\u1fff"
|
||||||
]
|
]
|
||||||
>
|
>
|
||||||
| < #CJK: // non-alphabets
|
| < CJK: // non-alphabets
|
||||||
[
|
[
|
||||||
"\u3040"-"\u318f",
|
"\u3040"-"\u318f",
|
||||||
"\u3300"-"\u337f",
|
"\u3300"-"\u337f",
|
||||||
|
@ -187,7 +186,7 @@ org.apache.lucene.analysis.Token next() throws IOException :
|
||||||
token = <EMAIL> |
|
token = <EMAIL> |
|
||||||
token = <HOST> |
|
token = <HOST> |
|
||||||
token = <NUM> |
|
token = <NUM> |
|
||||||
token = <SIGRAM> |
|
token = <CJK> |
|
||||||
token = <EOF>
|
token = <EOF>
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
|
|
|
@ -13,12 +13,11 @@ public interface StandardTokenizerConstants {
|
||||||
int NUM = 7;
|
int NUM = 7;
|
||||||
int P = 8;
|
int P = 8;
|
||||||
int HAS_DIGIT = 9;
|
int HAS_DIGIT = 9;
|
||||||
int SIGRAM = 10;
|
int ALPHA = 10;
|
||||||
int ALPHA = 11;
|
int LETTER = 11;
|
||||||
int LETTER = 12;
|
int CJK = 12;
|
||||||
int CJK = 13;
|
int DIGIT = 13;
|
||||||
int DIGIT = 14;
|
int NOISE = 14;
|
||||||
int NOISE = 15;
|
|
||||||
|
|
||||||
int DEFAULT = 0;
|
int DEFAULT = 0;
|
||||||
|
|
||||||
|
@ -33,7 +32,6 @@ public interface StandardTokenizerConstants {
|
||||||
"<NUM>",
|
"<NUM>",
|
||||||
"<P>",
|
"<P>",
|
||||||
"<HAS_DIGIT>",
|
"<HAS_DIGIT>",
|
||||||
"<SIGRAM>",
|
|
||||||
"<ALPHA>",
|
"<ALPHA>",
|
||||||
"<LETTER>",
|
"<LETTER>",
|
||||||
"<CJK>",
|
"<CJK>",
|
||||||
|
|
|
@ -8,7 +8,7 @@ public class StandardTokenizerTokenManager implements StandardTokenizerConstants
|
||||||
public void setDebugStream(java.io.PrintStream ds) { debugStream = ds; }
|
public void setDebugStream(java.io.PrintStream ds) { debugStream = ds; }
|
||||||
private final int jjMoveStringLiteralDfa0_0()
|
private final int jjMoveStringLiteralDfa0_0()
|
||||||
{
|
{
|
||||||
return jjMoveNfa_0(1, 0);
|
return jjMoveNfa_0(0, 0);
|
||||||
}
|
}
|
||||||
private final void jjCheckNAdd(int state)
|
private final void jjCheckNAdd(int state)
|
||||||
{
|
{
|
||||||
|
@ -101,7 +101,7 @@ private final int jjMoveNfa_0(int startState, int curPos)
|
||||||
{
|
{
|
||||||
switch(jjstateSet[--i])
|
switch(jjstateSet[--i])
|
||||||
{
|
{
|
||||||
case 1:
|
case 0:
|
||||||
if ((0x3ff000000000000L & l) != 0L)
|
if ((0x3ff000000000000L & l) != 0L)
|
||||||
{
|
{
|
||||||
if (kind > 1)
|
if (kind > 1)
|
||||||
|
@ -111,6 +111,10 @@ private final int jjMoveNfa_0(int startState, int curPos)
|
||||||
if ((0x3ff000000000000L & l) != 0L)
|
if ((0x3ff000000000000L & l) != 0L)
|
||||||
jjCheckNAddStates(18, 23);
|
jjCheckNAddStates(18, 23);
|
||||||
break;
|
break;
|
||||||
|
case 1:
|
||||||
|
if ((0x3ff000000000000L & l) != 0L)
|
||||||
|
jjCheckNAddStates(18, 23);
|
||||||
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
case 39:
|
case 39:
|
||||||
if ((0x3ff000000000000L & l) != 0L)
|
if ((0x3ff000000000000L & l) != 0L)
|
||||||
|
@ -380,7 +384,7 @@ private final int jjMoveNfa_0(int startState, int curPos)
|
||||||
{
|
{
|
||||||
switch(jjstateSet[--i])
|
switch(jjstateSet[--i])
|
||||||
{
|
{
|
||||||
case 1:
|
case 0:
|
||||||
if ((0x7fffffe07fffffeL & l) != 0L)
|
if ((0x7fffffe07fffffeL & l) != 0L)
|
||||||
jjCheckNAddStates(30, 35);
|
jjCheckNAddStates(30, 35);
|
||||||
if ((0x7fffffe07fffffeL & l) != 0L)
|
if ((0x7fffffe07fffffeL & l) != 0L)
|
||||||
|
@ -669,12 +673,11 @@ private final int jjMoveNfa_0(int startState, int curPos)
|
||||||
{
|
{
|
||||||
switch(jjstateSet[--i])
|
switch(jjstateSet[--i])
|
||||||
{
|
{
|
||||||
case 1:
|
case 0:
|
||||||
if (jjCanMove_0(hiByte, i1, i2, l1, l2))
|
if (jjCanMove_0(hiByte, i1, i2, l1, l2))
|
||||||
{
|
{
|
||||||
if (kind > 10)
|
if (kind > 12)
|
||||||
kind = 10;
|
kind = 12;
|
||||||
jjCheckNAdd(0);
|
|
||||||
}
|
}
|
||||||
if (jjCanMove_1(hiByte, i1, i2, l1, l2))
|
if (jjCanMove_1(hiByte, i1, i2, l1, l2))
|
||||||
jjCheckNAddStates(18, 23);
|
jjCheckNAddStates(18, 23);
|
||||||
|
@ -687,12 +690,9 @@ private final int jjMoveNfa_0(int startState, int curPos)
|
||||||
if (jjCanMove_2(hiByte, i1, i2, l1, l2))
|
if (jjCanMove_2(hiByte, i1, i2, l1, l2))
|
||||||
jjCheckNAddStates(30, 35);
|
jjCheckNAddStates(30, 35);
|
||||||
break;
|
break;
|
||||||
case 0:
|
case 1:
|
||||||
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
|
if (jjCanMove_1(hiByte, i1, i2, l1, l2))
|
||||||
break;
|
jjCheckNAddStates(18, 23);
|
||||||
if (kind > 10)
|
|
||||||
kind = 10;
|
|
||||||
jjCheckNAdd(0);
|
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
if (jjCanMove_2(hiByte, i1, i2, l1, l2))
|
if (jjCanMove_2(hiByte, i1, i2, l1, l2))
|
||||||
|
@ -1021,15 +1021,15 @@ private static final boolean jjCanMove_2(int hiByte, int i1, int i2, long l1, lo
|
||||||
}
|
}
|
||||||
public static final String[] jjstrLiteralImages = {
|
public static final String[] jjstrLiteralImages = {
|
||||||
"", null, null, null, null, null, null, null, null, null, null, null, null,
|
"", null, null, null, null, null, null, null, null, null, null, null, null,
|
||||||
null, null, null, };
|
null, null, };
|
||||||
public static final String[] lexStateNames = {
|
public static final String[] lexStateNames = {
|
||||||
"DEFAULT",
|
"DEFAULT",
|
||||||
};
|
};
|
||||||
static final long[] jjtoToken = {
|
static final long[] jjtoToken = {
|
||||||
0x4ffL,
|
0x10ffL,
|
||||||
};
|
};
|
||||||
static final long[] jjtoSkip = {
|
static final long[] jjtoSkip = {
|
||||||
0x8000L,
|
0x4000L,
|
||||||
};
|
};
|
||||||
protected CharStream input_stream;
|
protected CharStream input_stream;
|
||||||
private final int[] jjrounds = new int[73];
|
private final int[] jjrounds = new int[73];
|
||||||
|
@ -1115,9 +1115,9 @@ public Token getNextToken()
|
||||||
jjmatchedKind = 0x7fffffff;
|
jjmatchedKind = 0x7fffffff;
|
||||||
jjmatchedPos = 0;
|
jjmatchedPos = 0;
|
||||||
curPos = jjMoveStringLiteralDfa0_0();
|
curPos = jjMoveStringLiteralDfa0_0();
|
||||||
if (jjmatchedPos == 0 && jjmatchedKind > 15)
|
if (jjmatchedPos == 0 && jjmatchedKind > 14)
|
||||||
{
|
{
|
||||||
jjmatchedKind = 15;
|
jjmatchedKind = 14;
|
||||||
}
|
}
|
||||||
if (jjmatchedKind != 0x7fffffff)
|
if (jjmatchedKind != 0x7fffffff)
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in New Issue