Fix StandardTokenizer's handling of CJK characters.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150159 13f79535-47bb-0310-9956-ffa450edef68
2003-12-22 22:12:24 +00:00 · 2003-12-22 22:12:24 +00:00 · 8016aa3ea5
parent fb13af2325
commit 8016aa3ea5
5 changed files with 36 additions and 34 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -21,6 +21,11 @@ $Id$
    than the final token position.  Position is used in phrase
    searching (see PhraseQuery and Token.setPositionIncrement()).
 5. Fix StandardTokenizer's handling of CJK characters (Chinese,
    Japanese and Korean ideograms).  Previously contiguous sequences
    were combined in a single token, which is not very useful.  Now
    each ideogram generates a separate token, which is more useful.
 1.3 RC3
--- a/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
@ -47,8 +47,8 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
    case NUM:
      token = jj_consume_token(NUM);
      break;
-    case SIGRAM:
+    case CJK:
-      token = jj_consume_token(SIGRAM);
+      token = jj_consume_token(CJK);
      break;
    case 0:
      token = jj_consume_token(0);
@ -79,7 +79,7 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
      jj_la1_0();
   }
   private static void jj_la1_0() {
-      jj_la1_0 = new int[] {0x4ff,};
+      jj_la1_0 = new int[] {0x10ff,};
   }
  public StandardTokenizer(CharStream stream) {
@ -158,8 +158,8 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
  public ParseException generateParseException() {
    jj_expentries.removeAllElements();
-    boolean[] la1tokens = new boolean[16];
+    boolean[] la1tokens = new boolean[15];
-    for (int i = 0; i < 16; i++) {
+    for (int i = 0; i < 15; i++) {
      la1tokens[i] = false;
    }
    if (jj_kind >= 0) {
@ -175,7 +175,7 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
        }
      }
    }
-    for (int i = 0; i < 16; i++) {
+    for (int i = 0; i < 15; i++) {
      if (la1tokens[i]) {
        jj_expentry = new int[1];
        jj_expentry[0] = i;
--- a/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj
+++ b/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj
@ -125,7 +125,6 @@ TOKEN : {					  // token patterns
    (<LETTER>|<DIGIT>)*
  >
 | < SIGRAM: (<CJK>)+ >
 | < #ALPHA: (<LETTER>)+>
 | < #LETTER:					  // unicode letters
      [
@ -137,7 +136,7 @@ TOKEN : {					  // token patterns
       "\u0100"-"\u1fff"
      ]
  >
-| < #CJK:             // non-alphabets
+| < CJK:                                          // non-alphabets
      [
       "\u3040"-"\u318f",
       "\u3300"-"\u337f",
@ -187,7 +186,7 @@ org.apache.lucene.analysis.Token next() throws IOException :
    token = <EMAIL> |
    token = <HOST> |
    token = <NUM> |
-    token = <SIGRAM> |
+    token = <CJK> |
    token = <EOF>
   )
    {
--- a/src/java/org/apache/lucene/analysis/standard/StandardTokenizerConstants.java
+++ b/src/java/org/apache/lucene/analysis/standard/StandardTokenizerConstants.java
@ -13,12 +13,11 @@ public interface StandardTokenizerConstants {
  int NUM = 7;
  int P = 8;
  int HAS_DIGIT = 9;
-  int SIGRAM = 10;
+  int ALPHA = 10;
-  int ALPHA = 11;
+  int LETTER = 11;
-  int LETTER = 12;
+  int CJK = 12;
-  int CJK = 13;
+  int DIGIT = 13;
-  int DIGIT = 14;
+  int NOISE = 14;
  int NOISE = 15;
  int DEFAULT = 0;
@ -33,7 +32,6 @@ public interface StandardTokenizerConstants {
    "<NUM>",
    "<P>",
    "<HAS_DIGIT>",
    "<SIGRAM>",
    "<ALPHA>",
    "<LETTER>",
    "<CJK>",
--- a/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java
+++ b/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java
@ -8,7 +8,7 @@ public class StandardTokenizerTokenManager implements StandardTokenizerConstants
  public  void setDebugStream(java.io.PrintStream ds) { debugStream = ds; }
 private final int jjMoveStringLiteralDfa0_0()
 {
-   return jjMoveNfa_0(1, 0);
+   return jjMoveNfa_0(0, 0);
 }
 private final void jjCheckNAdd(int state)
 {
@ -101,7 +101,7 @@ private final int jjMoveNfa_0(int startState, int curPos)
         {
            switch(jjstateSet[--i])
            {
-               case 1:
+               case 0:
                  if ((0x3ff000000000000L & l) != 0L)
                  {
                     if (kind > 1)
@ -111,6 +111,10 @@ private final int jjMoveNfa_0(int startState, int curPos)
                  if ((0x3ff000000000000L & l) != 0L)
                     jjCheckNAddStates(18, 23);
                  break;
               case 1:
                  if ((0x3ff000000000000L & l) != 0L)
                     jjCheckNAddStates(18, 23);
                  break;
               case 2:
               case 39:
                  if ((0x3ff000000000000L & l) != 0L)
@ -380,7 +384,7 @@ private final int jjMoveNfa_0(int startState, int curPos)
         {
            switch(jjstateSet[--i])
            {
-               case 1:
+               case 0:
                  if ((0x7fffffe07fffffeL & l) != 0L)
                     jjCheckNAddStates(30, 35);
                  if ((0x7fffffe07fffffeL & l) != 0L)
@ -669,12 +673,11 @@ private final int jjMoveNfa_0(int startState, int curPos)
         {
            switch(jjstateSet[--i])
            {
-               case 1:
+               case 0:
                  if (jjCanMove_0(hiByte, i1, i2, l1, l2))
                  {
-                     if (kind > 10)
+                     if (kind > 12)
-                        kind = 10;
+                        kind = 12;
                     jjCheckNAdd(0);
                  }
                  if (jjCanMove_1(hiByte, i1, i2, l1, l2))
                     jjCheckNAddStates(18, 23);
@ -687,12 +690,9 @@ private final int jjMoveNfa_0(int startState, int curPos)
                  if (jjCanMove_2(hiByte, i1, i2, l1, l2))
                     jjCheckNAddStates(30, 35);
                  break;
-               case 0:
+               case 1:
-                  if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
+                  if (jjCanMove_1(hiByte, i1, i2, l1, l2))
-                     break;
+                     jjCheckNAddStates(18, 23);
                  if (kind > 10)
                     kind = 10;
                  jjCheckNAdd(0);
                  break;
               case 2:
                  if (jjCanMove_2(hiByte, i1, i2, l1, l2))
@ -1021,15 +1021,15 @@ private static final boolean jjCanMove_2(int hiByte, int i1, int i2, long l1, lo
 }
 public static final String[] jjstrLiteralImages = {
 "", null, null, null, null, null, null, null, null, null, null, null, null, 
-null, null, null, };
+null, null, };
 public static final String[] lexStateNames = {
   "DEFAULT", 
 };
 static final long[] jjtoToken = {
-   0x4ffL, 
+   0x10ffL, 
 };
 static final long[] jjtoSkip = {
-   0x8000L, 
+   0x4000L, 
 };
 protected CharStream input_stream;
 private final int[] jjrounds = new int[73];
@ -1115,9 +1115,9 @@ public Token getNextToken()
   jjmatchedKind = 0x7fffffff;
   jjmatchedPos = 0;
   curPos = jjMoveStringLiteralDfa0_0();
-   if (jjmatchedPos == 0 && jjmatchedKind > 15)
+   if (jjmatchedPos == 0 && jjmatchedKind > 14)
   {
-      jjmatchedKind = 15;
+      jjmatchedKind = 14;
   }
   if (jjmatchedKind != 0x7fffffff)
   {