LUCENE-461 - Fix for "StandardTokenizer splitting all of Korean words into separate characters", contributed by Cheolgoo Kang

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@332745 13f79535-47bb-0310-9956-ffa450edef68
Erik Hatcher 2005-11-12 08:33:21 +00:00
parent 1687a79648
commit f00afeee7a
6 changed files with 536 additions and 467 deletions

CHANGES.txt

@@ -234,8 +234,11 @@ Bug fixes
     the original token.
     (Yonik Seeley via Erik Hatcher, LUCENE-437)
-12. Added Unicode range to fix tokenization of Korean.
-    (Otis, http://issues.apache.org/jira/browse/LUCENE-444)
+12. Added Unicode range of Korean characters to StandardTokenizer,
+    grouping contiguous characters into a token rather than one token
+    per character. This change also changes the token type to "<CJ>"
+    for Chinese and Japanese character tokens (previously it was "<CJK>").
+    (Otis and Erik, via Cheolgoo Kang LUCENE-444 and LUCENE-461)
 13. FieldsReader now looks at FieldInfo.storeOffsetWithTermVector and
     FieldInfo.storePositionWithTermVector and creates the Field with
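
In practice the new behavior looks like the following minimal sketch (not part of the commit; the class name KoreanTokenDemo is hypothetical). It assumes the pre-2.x TokenStream.next()/Token API of this era and prints each token with its type; with this fix the two Hangul words come out as two whole tokens instead of one single-character token apiece:

    import java.io.StringReader;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;

    public class KoreanTokenDemo {
      public static void main(String[] args) throws Exception {
        TokenStream ts = new StandardAnalyzer()
            .tokenStream("field", new StringReader("안녕하세요 한글입니다"));
        for (Token t = ts.next(); t != null; t = ts.next()) {
          // After this fix: "안녕하세요 / <ALPHANUM>" and "한글입니다 / <ALPHANUM>".
          // Before it: one token per Hangul character, typed <CJK>.
          System.out.println(t.termText() + " / " + t.type());
        }
      }
    }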

src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java

@@ -55,8 +55,8 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
       case NUM:
         token = jj_consume_token(NUM);
         break;
-      case CJK:
-        token = jj_consume_token(CJK);
+      case CJ:
+        token = jj_consume_token(CJ);
         break;
       case 0:
         token = jj_consume_token(0);
@@ -166,8 +166,8 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
   public ParseException generateParseException() {
     jj_expentries.removeAllElements();
-    boolean[] la1tokens = new boolean[15];
-    for (int i = 0; i < 15; i++) {
+    boolean[] la1tokens = new boolean[16];
+    for (int i = 0; i < 16; i++) {
       la1tokens[i] = false;
     }
     if (jj_kind >= 0) {
@@ -183,7 +183,7 @@ public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer impl
         }
       }
     }
-    for (int i = 0; i < 15; i++) {
+    for (int i = 0; i < 16; i++) {
       if (la1tokens[i]) {
         jj_expentry = new int[1];
         jj_expentry[0] = i;

src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj

@@ -1,4 +1,4 @@
-/**
+/**f
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -59,7 +59,7 @@ PARSER_END(StandardTokenizer)
 TOKEN : { // token patterns
   // basic word: a sequence of digits & letters
-  <ALPHANUM: (<LETTER>|<DIGIT>)+ >
+  <ALPHANUM: (<LETTER>|<DIGIT>|<KOREAN>)+ >
   // internal apostrophes: O'Reilly, you're, O'Reilly's
   // use a post-filter to remove possesives
@@ -106,16 +106,20 @@ TOKEN : { // token patterns
     "\u0100"-"\u1fff"
     ]
   >
-| < CJK: // non-alphabets
+| < CJ: // Chinese, Japanese
     [
      "\u3040"-"\u318f",
      "\u3300"-"\u337f",
      "\u3400"-"\u3d2d",
      "\u4e00"-"\u9fff",
-     "\uac00"-"\ud7af",
      "\uf900"-"\ufaff"
     ]
   >
+| < KOREAN: // Korean
+    [
+     "\uac00"-"\ud7af"
+    ]
+  >
 | < #DIGIT: // unicode digits
     [
      "\u0030"-"\u0039",
@@ -157,7 +161,7 @@ org.apache.lucene.analysis.Token next() throws IOException :
     token = <EMAIL> |
     token = <HOST> |
     token = <NUM> |
-    token = <CJK> |
+    token = <CJ> |
     token = <EOF>
   )
   {
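
The shape of the grammar is what produces the grouping: <KOREAN> is now an alternative inside the one-or-more loop of <ALPHANUM>, so a run of Hangul syllables is consumed as one token, while <CJ> remains a standalone single-character class, so Chinese and Japanese text still yields one token per character. A sketch of the same character classification in plain Java (hypothetical helpers, not Lucene code):

    // Hypothetical helpers mirroring the grammar's character classes above.
    final class CharClasses {
      // The new <KOREAN> class: Hangul syllables.
      static boolean isKorean(char c) {
        return c >= '\uAC00' && c <= '\uD7AF';
      }
      // The <CJ> class: the old <CJK> ranges minus the Hangul block.
      static boolean isCJ(char c) {
        return (c >= '\u3040' && c <= '\u318F')
            || (c >= '\u3300' && c <= '\u337F')
            || (c >= '\u3400' && c <= '\u3D2D')
            || (c >= '\u4E00' && c <= '\u9FFF')  // CJK unified ideographs
            || (c >= '\uF900' && c <= '\uFAFF'); // CJK compatibility ideographs
      }
    }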

src/java/org/apache/lucene/analysis/standard/StandardTokenizerConstants.java

@@ -15,9 +15,10 @@ public interface StandardTokenizerConstants {
   int HAS_DIGIT = 9;
   int ALPHA = 10;
   int LETTER = 11;
-  int CJK = 12;
-  int DIGIT = 13;
-  int NOISE = 14;
+  int CJ = 12;
+  int KOREAN = 13;
+  int DIGIT = 14;
+  int NOISE = 15;
   int DEFAULT = 0;
@@ -34,7 +35,8 @@ public interface StandardTokenizerConstants {
     "<HAS_DIGIT>",
     "<ALPHA>",
     "<LETTER>",
-    "<CJK>",
+    "<CJ>",
+    "<KOREAN>",
     "<DIGIT>",
     "<NOISE>",
   };
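
The renumbering matters because the generated tokenizer reports a token's type by indexing tokenImage with the token kind, which is why the CJK-to-CJ rename also changes the strings Token.type() returns; the extra <KOREAN> kind is likewise why the la1tokens array in generateParseException grew from 15 to 16 slots. A small illustrative sketch (TokenKindDemo is hypothetical, not part of the commit):

    import org.apache.lucene.analysis.standard.StandardTokenizerConstants;

    public class TokenKindDemo implements StandardTokenizerConstants {
      public static void main(String[] args) {
        System.out.println(tokenImage[CJ]);     // prints <CJ> (kind 12)
        System.out.println(tokenImage[KOREAN]); // prints <KOREAN> (kind 13)
      }
    }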

src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java

@@ -94,5 +94,8 @@ public class TestStandardAnalyzer extends TestCase {
     assertAnalyzesTo(a, "C++", new String[]{"c"});
     assertAnalyzesTo(a, "C#", new String[]{"c"});
+
+    // Korean words
+    assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
   }
 }
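
If the complementary <CJ> behavior also needed pinning down, a hypothetical extra assertion in the same style (not part of this commit) could sit next to the Korean check above:

    // Chinese/Japanese characters still tokenize one per character:
    assertAnalyzesTo(a, "中华人民共和国",
        new String[]{"中", "华", "人", "民", "共", "和", "国"});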