Fix StandardTokenizer's handling of CJK characters.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150159 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Doug Cutting 2003-12-22 22:12:24 +00:00
parent fb13af2325
commit 8016aa3ea5
5 changed files with 36 additions and 34 deletions

View File

@ -21,6 +21,11 @@ $Id$
than the final token position. Position is used in phrase
searching (see PhraseQuery and Token.setPositionIncrement()).
5. Fix StandardTokenizer's handling of CJK characters (Chinese,
Japanese and Korean ideograms). Previously contiguous sequences
were combined in a single token, which is not very useful. Now
each ideogram generates a separate token, which is more useful.
1.3 RC3

View File

@ -47,8 +47,8 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
case NUM:
token = jj_consume_token(NUM);
break;
case SIGRAM:
token = jj_consume_token(SIGRAM);
case CJK:
token = jj_consume_token(CJK);
break;
case 0:
token = jj_consume_token(0);
@ -79,7 +79,7 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
jj_la1_0();
}
private static void jj_la1_0() {
jj_la1_0 = new int[] {0x4ff,};
jj_la1_0 = new int[] {0x10ff,};
}
public StandardTokenizer(CharStream stream) {
@ -158,8 +158,8 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
public ParseException generateParseException() {
jj_expentries.removeAllElements();
boolean[] la1tokens = new boolean[16];
for (int i = 0; i < 16; i++) {
boolean[] la1tokens = new boolean[15];
for (int i = 0; i < 15; i++) {
la1tokens[i] = false;
}
if (jj_kind >= 0) {
@ -175,7 +175,7 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
}
}
}
for (int i = 0; i < 16; i++) {
for (int i = 0; i < 15; i++) {
if (la1tokens[i]) {
jj_expentry = new int[1];
jj_expentry[0] = i;

View File

@ -125,7 +125,6 @@ TOKEN : { // token patterns
(<LETTER>|<DIGIT>)*
>
| < SIGRAM: (<CJK>)+ >
| < #ALPHA: (<LETTER>)+>
| < #LETTER: // unicode letters
[
@ -137,7 +136,7 @@ TOKEN : { // token patterns
"\u0100"-"\u1fff"
]
>
| < #CJK: // non-alphabets
| < CJK: // non-alphabets
[
"\u3040"-"\u318f",
"\u3300"-"\u337f",
@ -187,7 +186,7 @@ org.apache.lucene.analysis.Token next() throws IOException :
token = <EMAIL> |
token = <HOST> |
token = <NUM> |
token = <SIGRAM> |
token = <CJK> |
token = <EOF>
)
{

View File

@ -13,12 +13,11 @@ public interface StandardTokenizerConstants {
int NUM = 7;
int P = 8;
int HAS_DIGIT = 9;
int SIGRAM = 10;
int ALPHA = 11;
int LETTER = 12;
int CJK = 13;
int DIGIT = 14;
int NOISE = 15;
int ALPHA = 10;
int LETTER = 11;
int CJK = 12;
int DIGIT = 13;
int NOISE = 14;
int DEFAULT = 0;
@ -33,7 +32,6 @@ public interface StandardTokenizerConstants {
"<NUM>",
"<P>",
"<HAS_DIGIT>",
"<SIGRAM>",
"<ALPHA>",
"<LETTER>",
"<CJK>",

View File

@ -8,7 +8,7 @@ public class StandardTokenizerTokenManager implements StandardTokenizerConstants
public void setDebugStream(java.io.PrintStream ds) { debugStream = ds; }
private final int jjMoveStringLiteralDfa0_0()
{
return jjMoveNfa_0(1, 0);
return jjMoveNfa_0(0, 0);
}
private final void jjCheckNAdd(int state)
{
@ -101,7 +101,7 @@ private final int jjMoveNfa_0(int startState, int curPos)
{
switch(jjstateSet[--i])
{
case 1:
case 0:
if ((0x3ff000000000000L & l) != 0L)
{
if (kind > 1)
@ -111,6 +111,10 @@ private final int jjMoveNfa_0(int startState, int curPos)
if ((0x3ff000000000000L & l) != 0L)
jjCheckNAddStates(18, 23);
break;
case 1:
if ((0x3ff000000000000L & l) != 0L)
jjCheckNAddStates(18, 23);
break;
case 2:
case 39:
if ((0x3ff000000000000L & l) != 0L)
@ -380,7 +384,7 @@ private final int jjMoveNfa_0(int startState, int curPos)
{
switch(jjstateSet[--i])
{
case 1:
case 0:
if ((0x7fffffe07fffffeL & l) != 0L)
jjCheckNAddStates(30, 35);
if ((0x7fffffe07fffffeL & l) != 0L)
@ -669,12 +673,11 @@ private final int jjMoveNfa_0(int startState, int curPos)
{
switch(jjstateSet[--i])
{
case 1:
case 0:
if (jjCanMove_0(hiByte, i1, i2, l1, l2))
{
if (kind > 10)
kind = 10;
jjCheckNAdd(0);
if (kind > 12)
kind = 12;
}
if (jjCanMove_1(hiByte, i1, i2, l1, l2))
jjCheckNAddStates(18, 23);
@ -687,12 +690,9 @@ private final int jjMoveNfa_0(int startState, int curPos)
if (jjCanMove_2(hiByte, i1, i2, l1, l2))
jjCheckNAddStates(30, 35);
break;
case 0:
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
if (kind > 10)
kind = 10;
jjCheckNAdd(0);
case 1:
if (jjCanMove_1(hiByte, i1, i2, l1, l2))
jjCheckNAddStates(18, 23);
break;
case 2:
if (jjCanMove_2(hiByte, i1, i2, l1, l2))
@ -1021,15 +1021,15 @@ private static final boolean jjCanMove_2(int hiByte, int i1, int i2, long l1, lo
}
public static final String[] jjstrLiteralImages = {
"", null, null, null, null, null, null, null, null, null, null, null, null,
null, null, null, };
null, null, };
public static final String[] lexStateNames = {
"DEFAULT",
};
static final long[] jjtoToken = {
0x4ffL,
0x10ffL,
};
static final long[] jjtoSkip = {
0x8000L,
0x4000L,
};
protected CharStream input_stream;
private final int[] jjrounds = new int[73];
@ -1115,9 +1115,9 @@ public Token getNextToken()
jjmatchedKind = 0x7fffffff;
jjmatchedPos = 0;
curPos = jjMoveStringLiteralDfa0_0();
if (jjmatchedPos == 0 && jjmatchedKind > 15)
if (jjmatchedPos == 0 && jjmatchedKind > 14)
{
jjmatchedKind = 15;
jjmatchedKind = 14;
}
if (jjmatchedKind != 0x7fffffff)
{