LUCENE-973: add test case for CJKAnalyzer; fix trailing empty string bug

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@785287 13f79535-47bb-0310-9956-ffa450edef68
2009-06-16 16:38:39 +00:00 · 2009-06-16 16:38:39 +00:00 · 835c405be0
parent 2f7b9df887
commit 835c405be0
2 changed files with 190 additions and 14 deletions
--- a/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
@ -37,7 +37,18 @@ import java.io.Reader;
 */
 public final class CJKTokenizer extends Tokenizer {
    //~ Static fields/initializers ---------------------------------------------
+    /** Word token type */
+    static final int WORD_TYPE = 0;
+  
+    /** Single byte token type */
+    static final int SINGLE_TOKEN_TYPE = 1;

+    /** Double byte token type */
+    static final int DOUBLE_TOKEN_TYPE = 2;
+  
+    /** Names for token types */
+    static final String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };
+  
    /** Max word length */
    private static final int MAX_WORD_LEN = 255;

@ -68,7 +79,7 @@ public final class CJKTokenizer extends Tokenizer {
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

    /** word type: single=>ASCII  double=>non-ASCII word=>default */
-    private String tokenType = "word";
+    private int tokenType = WORD_TYPE;

    /**
     * tag: previous character is a cached double-byte character  "C1C2C3C4"
@ -105,12 +116,15 @@ public final class CJKTokenizer extends Tokenizer {
    public final Token next(final Token reusableToken) throws java.io.IOException {
        /** how many character(s) has been stored in buffer */
        assert reusableToken != null;
-        int length = 0;

-        /** the position used to create Token */
-        int start = offset;
+        while(true) { // loop until we find a non-empty token

-        while (true) {
+          int length = 0;
+
+          /** the position used to create Token */
+          int start = offset;
+
+          while (true) { // loop until we've found a full token
            /** current character */
            char c;

@ -150,7 +164,7 @@ public final class CJKTokenizer extends Tokenizer {
                if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
                  int i = (int) c;
                  if (i >= 65281 && i <= 65374) {
-                    /** convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
+                    // convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
                    i = i - 65248;
                    c = (char) i;
                  }
@ -165,19 +179,17 @@ public final class CJKTokenizer extends Tokenizer {
                        //      ^--: the current character begin to token the ASCII
                        // letter
                        start = offset - 1;
-                    } else if (tokenType == "double") {
+                    } else if (tokenType == DOUBLE_TOKEN_TYPE) {
                        // "javaC1C2C3C4linux" <br>
                        //              ^--: the previous non-ASCII
                        // : the current character
                        offset--;
                        bufferIndex--;
-                        tokenType = "single";

                        if (preIsTokened == true) {
                            // there is only one non-ASCII has been stored
                            length = 0;
                            preIsTokened = false;
-
                            break;
                        } else {
                            break;
@ -186,7 +198,7 @@ public final class CJKTokenizer extends Tokenizer {

                    // store the LowerCase(c) in the buffer
                    buffer[length++] = Character.toLowerCase(c);
-                    tokenType = "single";
+                    tokenType = SINGLE_TOKEN_TYPE;

                    // break the procedure if buffer overflowed!
                    if (length == MAX_WORD_LEN) {
@ -206,9 +218,9 @@ public final class CJKTokenizer extends Tokenizer {
                    if (length == 0) {
                        start = offset - 1;
                        buffer[length++] = c;
-                        tokenType = "double";
+                        tokenType = DOUBLE_TOKEN_TYPE;
                    } else {
-                        if (tokenType == "single") {
+                      if (tokenType == SINGLE_TOKEN_TYPE) {
                            offset--;
                            bufferIndex--;

@ -216,7 +228,7 @@ public final class CJKTokenizer extends Tokenizer {
                            break;
                        } else {
                            buffer[length++] = c;
-                            tokenType = "double";
+                            tokenType = DOUBLE_TOKEN_TYPE;

                            if (length == 2) {
                                offset--;
@ -238,7 +250,16 @@ public final class CJKTokenizer extends Tokenizer {
                }
            }
        }
+      
+        if (length > 0) {
+            return reusableToken.reinit
+                (buffer, 0, length, start, start+length, TOKEN_TYPE_NAMES[tokenType]);
+        } else if (dataLen == -1) {
+          return null;
+        }

-        return reusableToken.reinit(buffer, 0, length, start, start+length, tokenType);
+        // Cycle back and try for the next token (don't
+        // return an empty string)
+      }
    }
 }
--- a/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
+++ b/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
@ -0,0 +1,155 @@
+package org.apache.lucene.analysis.cjk;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.Token;
+
+
+public class TestCJKTokenizer extends TestCase{
+
+  public Token newToken(String termText, int start, int end, int type) {
+    Token token = new Token(start, end);
+    token.setTermBuffer(termText);
+    token.setType(CJKTokenizer.TOKEN_TYPE_NAMES[type]);
+    return token;
+  }
+
+  public void checkCJKToken(final String str, final Token[] out_tokens) throws IOException {
+    CJKTokenizer tokenizer = new CJKTokenizer(new StringReader(str));
+    int i = 0;
+    System.out.println("string[" + str + "]");
+    System.out.print("tokens[");
+    final Token reusableToken = new Token();
+    for (Token token = tokenizer.next(reusableToken) ;
+         token != null                               ; 
+         token = tokenizer.next(reusableToken)       ) {
+      if (token.term().equals(out_tokens[i].term()) 
+          && token.startOffset() == out_tokens[i].startOffset() 
+          && token.endOffset() == out_tokens[i].endOffset() 
+          && token.type().equals(out_tokens[i].type()) ) {
+        System.out.print( token.term() + " ");
+      }
+      else {
+        fail(token.term() + " (start: " + token.startOffset() 
+             + " end: " + token.endOffset() + " type: " + token.type() + ") != "
+             + out_tokens[i].term() + " (start: " + out_tokens[i].startOffset() 
+             + " end: " + out_tokens[i].endOffset() 
+             + " type: " + out_tokens[i].type() + ")");
+        break;
+      }
+      ++i;
+    }
+    System.out.println("]" + System.getProperty("line.separator"));
+  }
+  
+  public void testJa1() throws IOException {
+    String str = "\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341";
+       
+    Token[] out_tokens = { 
+      newToken("\u4e00\u4e8c", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE), 
+      newToken("\u4e8c\u4e09", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e09\u56db", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u56db\u4e94", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE), 
+      newToken("\u4e94\u516d", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE), 
+      newToken("\u516d\u4e03", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e03\u516b", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u516b\u4e5d", 7, 9, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e5d\u5341", 8,10, CJKTokenizer.DOUBLE_TOKEN_TYPE)
+    };
+    checkCJKToken(str, out_tokens);
+  }
+  
+  public void testJa2() throws IOException {
+    String str = "\u4e00 \u4e8c\u4e09\u56db \u4e94\u516d\u4e03\u516b\u4e5d \u5341";
+       
+    Token[] out_tokens = { 
+      newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE), 
+      newToken("\u4e8c\u4e09", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e09\u56db", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e94\u516d", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE), 
+      newToken("\u516d\u4e03", 7, 9, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e03\u516b", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u516b\u4e5d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u5341", 12,13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
+    };
+    checkCJKToken(str, out_tokens);
+  }
+  
+  public void testC() throws IOException {
+    String str = "abc defgh ijklmn opqrstu vwxy z";
+       
+    Token[] out_tokens = { 
+      newToken("abc", 0, 3, CJKTokenizer.SINGLE_TOKEN_TYPE), 
+      newToken("defgh", 4, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("ijklmn", 10, 16, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("opqrstu", 17, 24, CJKTokenizer.SINGLE_TOKEN_TYPE), 
+      newToken("vwxy", 25, 29, CJKTokenizer.SINGLE_TOKEN_TYPE), 
+      newToken("z", 30, 31, CJKTokenizer.SINGLE_TOKEN_TYPE),
+    };
+    checkCJKToken(str, out_tokens);
+  }
+  
+  public void testMix() throws IOException {
+    String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";
+       
+    Token[] out_tokens = { 
+      newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE), 
+      newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE), 
+      newToken("abc", 5, 8, CJKTokenizer.SINGLE_TOKEN_TYPE), 
+      newToken("\u304b\u304d", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u304d\u304f", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u304f\u3051", 10,12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u3051\u3053", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
+    };
+    checkCJKToken(str, out_tokens);
+  }
+  
+  public void testMix2() throws IOException {
+    String str = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053";
+       
+    Token[] out_tokens = { 
+      newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE), 
+      newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE), 
+      newToken("ab", 5, 7, CJKTokenizer.SINGLE_TOKEN_TYPE), 
+      newToken("\u3093", 7, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE), 
+      newToken("c", 8, 9, CJKTokenizer.SINGLE_TOKEN_TYPE), 
+      newToken("\u304b\u304d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u304d\u304f", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u304f\u3051", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u3053", 14,15, CJKTokenizer.DOUBLE_TOKEN_TYPE)
+    };
+    checkCJKToken(str, out_tokens);
+  }
+
+  public void testSingleChar() throws IOException {
+    String str = "\u4e00";
+       
+    Token[] out_tokens = { 
+      newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE), 
+    };
+    checkCJKToken(str, out_tokens);
+  }
+}