Fix StandardTokenizer's handling of CJK characters.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150159 13f79535-47bb-0310-9956-ffa450edef68
2025-02-20 17:07:09 +00:00 · 2003-12-22 22:12:24 +00:00 · 2003-12-22 22:12:24 +00:00 · 8016aa3ea5
commit 8016aa3ea5
parent fb13af2325
5 changed files with 36 additions and 34 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -21,6 +21,11 @@ $Id$
    than the final token position.  Position is used in phrase
    searching (see PhraseQuery and Token.setPositionIncrement()).

+ 5. Fix StandardTokenizer's handling of CJK characters (Chinese,
+    Japanese and Korean ideograms).  Previously contiguous sequences
+    were combined in a single token, which is not very useful.  Now
+    each ideogram generates a separate token, which is more useful.
+

 1.3 RC3

--- a/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
@ -47,8 +47,8 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
    case NUM:
      token = jj_consume_token(NUM);
      break;
-    case SIGRAM:
-      token = jj_consume_token(SIGRAM);
+    case CJK:
+      token = jj_consume_token(CJK);
      break;
    case 0:
      token = jj_consume_token(0);
@ -79,7 +79,7 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
      jj_la1_0();
   }
   private static void jj_la1_0() {
-      jj_la1_0 = new int[] {0x4ff,};
+      jj_la1_0 = new int[] {0x10ff,};
   }

  public StandardTokenizer(CharStream stream) {
@ -158,8 +158,8 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl

  public ParseException generateParseException() {
    jj_expentries.removeAllElements();
-    boolean[] la1tokens = new boolean[16];
-    for (int i = 0; i < 16; i++) {
+    boolean[] la1tokens = new boolean[15];
+    for (int i = 0; i < 15; i++) {
      la1tokens[i] = false;
    }
    if (jj_kind >= 0) {
@ -175,7 +175,7 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
        }
      }
    }
-    for (int i = 0; i < 16; i++) {
+    for (int i = 0; i < 15; i++) {
      if (la1tokens[i]) {
        jj_expentry = new int[1];
        jj_expentry[0] = i;
--- a/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj
+++ b/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj
@ -125,7 +125,6 @@ TOKEN : {					  // token patterns
    (<LETTER>|<DIGIT>)*
  >

-| < SIGRAM: (<CJK>)+ >
 | < #ALPHA: (<LETTER>)+>
 | < #LETTER:					  // unicode letters
      [
@ -137,7 +136,7 @@ TOKEN : {					  // token patterns
       "\u0100"-"\u1fff"
      ]
  >
-| < #CJK:             // non-alphabets
+| < CJK:                                          // non-alphabets
      [
       "\u3040"-"\u318f",
       "\u3300"-"\u337f",
@ -187,7 +186,7 @@ org.apache.lucene.analysis.Token next() throws IOException :
    token = <EMAIL> |
    token = <HOST> |
    token = <NUM> |
-    token = <SIGRAM> |
+    token = <CJK> |
    token = <EOF>
   )
    {
--- a/src/java/org/apache/lucene/analysis/standard/StandardTokenizerConstants.java
+++ b/src/java/org/apache/lucene/analysis/standard/StandardTokenizerConstants.java
@ -13,12 +13,11 @@ public interface StandardTokenizerConstants {
  int NUM = 7;
  int P = 8;
  int HAS_DIGIT = 9;
-  int SIGRAM = 10;
-  int ALPHA = 11;
-  int LETTER = 12;
-  int CJK = 13;
-  int DIGIT = 14;
-  int NOISE = 15;
+  int ALPHA = 10;
+  int LETTER = 11;
+  int CJK = 12;
+  int DIGIT = 13;
+  int NOISE = 14;

  int DEFAULT = 0;

@ -33,7 +32,6 @@ public interface StandardTokenizerConstants {
    "<NUM>",
    "<P>",
    "<HAS_DIGIT>",
-    "<SIGRAM>",
    "<ALPHA>",
    "<LETTER>",
    "<CJK>",
--- a/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java
+++ b/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java
@ -8,7 +8,7 @@ public class StandardTokenizerTokenManager implements StandardTokenizerConstants
  public  void setDebugStream(java.io.PrintStream ds) { debugStream = ds; }
 private final int jjMoveStringLiteralDfa0_0()
 {
-   return jjMoveNfa_0(1, 0);
+   return jjMoveNfa_0(0, 0);
 }
 private final void jjCheckNAdd(int state)
 {
@ -101,7 +101,7 @@ private final int jjMoveNfa_0(int startState, int curPos)
         {
            switch(jjstateSet[--i])
            {
-               case 1:
+               case 0:
                  if ((0x3ff000000000000L & l) != 0L)
                  {
                     if (kind > 1)
@ -111,6 +111,10 @@ private final int jjMoveNfa_0(int startState, int curPos)
                  if ((0x3ff000000000000L & l) != 0L)
                     jjCheckNAddStates(18, 23);
                  break;
+               case 1:
+                  if ((0x3ff000000000000L & l) != 0L)
+                     jjCheckNAddStates(18, 23);
+                  break;
               case 2:
               case 39:
                  if ((0x3ff000000000000L & l) != 0L)
@ -380,7 +384,7 @@ private final int jjMoveNfa_0(int startState, int curPos)
         {
            switch(jjstateSet[--i])
            {
-               case 1:
+               case 0:
                  if ((0x7fffffe07fffffeL & l) != 0L)
                     jjCheckNAddStates(30, 35);
                  if ((0x7fffffe07fffffeL & l) != 0L)
@ -669,12 +673,11 @@ private final int jjMoveNfa_0(int startState, int curPos)
         {
            switch(jjstateSet[--i])
            {
-               case 1:
+               case 0:
                  if (jjCanMove_0(hiByte, i1, i2, l1, l2))
                  {
-                     if (kind > 10)
-                        kind = 10;
-                     jjCheckNAdd(0);
+                     if (kind > 12)
+                        kind = 12;
                  }
                  if (jjCanMove_1(hiByte, i1, i2, l1, l2))
                     jjCheckNAddStates(18, 23);
@ -687,12 +690,9 @@ private final int jjMoveNfa_0(int startState, int curPos)
                  if (jjCanMove_2(hiByte, i1, i2, l1, l2))
                     jjCheckNAddStates(30, 35);
                  break;
-               case 0:
-                  if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
-                     break;
-                  if (kind > 10)
-                     kind = 10;
-                  jjCheckNAdd(0);
+               case 1:
+                  if (jjCanMove_1(hiByte, i1, i2, l1, l2))
+                     jjCheckNAddStates(18, 23);
                  break;
               case 2:
                  if (jjCanMove_2(hiByte, i1, i2, l1, l2))
@ -1021,15 +1021,15 @@ private static final boolean jjCanMove_2(int hiByte, int i1, int i2, long l1, lo
 }
 public static final String[] jjstrLiteralImages = {
 "", null, null, null, null, null, null, null, null, null, null, null, null, 
-null, null, null, };
+null, null, };
 public static final String[] lexStateNames = {
   "DEFAULT", 
 };
 static final long[] jjtoToken = {
-   0x4ffL, 
+   0x10ffL, 
 };
 static final long[] jjtoSkip = {
-   0x8000L, 
+   0x4000L, 
 };
 protected CharStream input_stream;
 private final int[] jjrounds = new int[73];
@ -1115,9 +1115,9 @@ public Token getNextToken()
   jjmatchedKind = 0x7fffffff;
   jjmatchedPos = 0;
   curPos = jjMoveStringLiteralDfa0_0();
-   if (jjmatchedPos == 0 && jjmatchedKind > 15)
+   if (jjmatchedPos == 0 && jjmatchedKind > 14)
   {
-      jjmatchedKind = 15;
+      jjmatchedKind = 14;
   }
   if (jjmatchedKind != 0x7fffffff)
   {