LUCENE-973: add test case for CJKAnalyzer; fix trailing empty string bug

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@785287 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2009-06-16 16:38:39 +00:00
parent 2f7b9df887
commit 835c405be0
2 changed files with 190 additions and 14 deletions

View File

@ -37,7 +37,18 @@ import java.io.Reader;
*/
public final class CJKTokenizer extends Tokenizer {
//~ Static fields/initializers ---------------------------------------------
/** Word token type */
static final int WORD_TYPE = 0;
/** Single byte token type */
static final int SINGLE_TOKEN_TYPE = 1;
/** Double byte token type */
static final int DOUBLE_TOKEN_TYPE = 2;
/** Names for token types */
static final String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };
/** Max word length */
private static final int MAX_WORD_LEN = 255;
@ -68,7 +79,7 @@ public final class CJKTokenizer extends Tokenizer {
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
/** word type: single=>ASCII double=>non-ASCII word=>default */
private String tokenType = "word";
private int tokenType = WORD_TYPE;
/**
* tag: previous character is a cached double-byte character "C1C2C3C4"
@ -105,12 +116,15 @@ public final class CJKTokenizer extends Tokenizer {
public final Token next(final Token reusableToken) throws java.io.IOException {
/** how many character(s) have been stored in buffer */
assert reusableToken != null;
int length = 0;
/** the position used to create Token */
int start = offset;
while(true) { // loop until we find a non-empty token
while (true) {
int length = 0;
/** the position used to create Token */
int start = offset;
while (true) { // loop until we've found a full token
/** current character */
char c;
@ -150,7 +164,7 @@ public final class CJKTokenizer extends Tokenizer {
if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
int i = (int) c;
if (i >= 65281 && i <= 65374) {
/** convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
// convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
i = i - 65248;
c = (char) i;
}
@ -165,19 +179,17 @@ public final class CJKTokenizer extends Tokenizer {
// ^--: the current character begin to token the ASCII
// letter
start = offset - 1;
} else if (tokenType == "double") {
} else if (tokenType == DOUBLE_TOKEN_TYPE) {
// "javaC1C2C3C4linux" <br>
// ^--: the previous non-ASCII
// : the current character
offset--;
bufferIndex--;
tokenType = "single";
if (preIsTokened == true) {
// only one non-ASCII character has been stored
length = 0;
preIsTokened = false;
break;
} else {
break;
@ -186,7 +198,7 @@ public final class CJKTokenizer extends Tokenizer {
// store the LowerCase(c) in the buffer
buffer[length++] = Character.toLowerCase(c);
tokenType = "single";
tokenType = SINGLE_TOKEN_TYPE;
// break the procedure if buffer overflowed!
if (length == MAX_WORD_LEN) {
@ -206,9 +218,9 @@ public final class CJKTokenizer extends Tokenizer {
if (length == 0) {
start = offset - 1;
buffer[length++] = c;
tokenType = "double";
tokenType = DOUBLE_TOKEN_TYPE;
} else {
if (tokenType == "single") {
if (tokenType == SINGLE_TOKEN_TYPE) {
offset--;
bufferIndex--;
@ -216,7 +228,7 @@ public final class CJKTokenizer extends Tokenizer {
break;
} else {
buffer[length++] = c;
tokenType = "double";
tokenType = DOUBLE_TOKEN_TYPE;
if (length == 2) {
offset--;
@ -238,7 +250,16 @@ public final class CJKTokenizer extends Tokenizer {
}
}
}
if (length > 0) {
return reusableToken.reinit
(buffer, 0, length, start, start+length, TOKEN_TYPE_NAMES[tokenType]);
} else if (dataLen == -1) {
return null;
}
return reusableToken.reinit(buffer, 0, length, start, start+length, tokenType);
// Cycle back and try for the next token (don't
// return an empty string)
}
}
}

View File

@ -0,0 +1,155 @@
package org.apache.lucene.analysis.cjk;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
/**
 * Tests for {@link CJKTokenizer}.
 *
 * Each test feeds a string to the tokenizer and compares the produced tokens,
 * one by one, against an expected list: term text, start offset, end offset
 * and token type must all match, and the tokenizer must produce exactly as
 * many tokens as expected (no fewer, no more).
 */
public class TestCJKTokenizer extends TestCase{

  /**
   * Builds an expected token.
   *
   * @param termText the expected term text
   * @param start the expected start offset
   * @param end the expected end offset
   * @param type index into {@link CJKTokenizer#TOKEN_TYPE_NAMES}, e.g.
   *        {@link CJKTokenizer#SINGLE_TOKEN_TYPE} or {@link CJKTokenizer#DOUBLE_TOKEN_TYPE}
   * @return a token carrying the given term, offsets and type name
   */
  public Token newToken(String termText, int start, int end, int type) {
    Token token = new Token(start, end);
    token.setTermBuffer(termText);
    token.setType(CJKTokenizer.TOKEN_TYPE_NAMES[type]);
    return token;
  }

  /**
   * Tokenizes <code>str</code> with a {@link CJKTokenizer} and asserts that the
   * resulting token sequence is exactly <code>out_tokens</code>.
   *
   * @param str the text to tokenize
   * @param out_tokens the expected tokens, in order
   * @throws IOException if the tokenizer fails to read its input
   */
  public void checkCJKToken(final String str, final Token[] out_tokens) throws IOException {
    CJKTokenizer tokenizer = new CJKTokenizer(new StringReader(str));
    final Token reusableToken = new Token();
    int i = 0;
    for (Token token = tokenizer.next(reusableToken) ;
         token != null ;
         token = tokenizer.next(reusableToken) ) {
      // Report surplus tokens as a test failure rather than letting the
      // array access below throw ArrayIndexOutOfBoundsException.
      if (i >= out_tokens.length) {
        fail("unexpected extra token #" + i + " for input [" + str + "]: " + describe(token));
      }
      final Token expected = out_tokens[i];
      final String where = "token #" + i + " of input [" + str + "], got " + describe(token)
        + ", expected " + describe(expected) + ": ";
      assertEquals(where + "term", expected.term(), token.term());
      assertEquals(where + "start offset", expected.startOffset(), token.startOffset());
      assertEquals(where + "end offset", expected.endOffset(), token.endOffset());
      assertEquals(where + "type", expected.type(), token.type());
      ++i;
    }
    // Without this check a tokenizer that stops early (or emits nothing at
    // all) would pass every test.
    assertEquals("number of tokens for input [" + str + "]", out_tokens.length, i);
  }

  /** Renders a token's term, offsets and type for use in failure messages. */
  private static String describe(Token token) {
    return token.term() + " (start: " + token.startOffset()
      + " end: " + token.endOffset() + " type: " + token.type() + ")";
  }

  /** A run of ten ideographs is emitted as nine overlapping bigrams. */
  public void testJa1() throws IOException {
    String str = "\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341";

    Token[] out_tokens = {
      newToken("\u4e00\u4e8c", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u4e8c\u4e09", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u4e09\u56db", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u56db\u4e94", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u4e94\u516d", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u516d\u4e03", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u4e03\u516b", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u516b\u4e5d", 7, 9, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u4e5d\u5341", 8,10, CJKTokenizer.DOUBLE_TOKEN_TYPE)
    };
    checkCJKToken(str, out_tokens);
  }

  /** Spaces split ideograph runs; an isolated ideograph becomes a one-character token. */
  public void testJa2() throws IOException {
    String str = "\u4e00 \u4e8c\u4e09\u56db \u4e94\u516d\u4e03\u516b\u4e5d \u5341";

    Token[] out_tokens = {
      newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u4e8c\u4e09", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u4e09\u56db", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u4e94\u516d", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u516d\u4e03", 7, 9, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u4e03\u516b", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u516b\u4e5d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u5341", 12,13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
    };
    checkCJKToken(str, out_tokens);
  }

  /** Space-separated ASCII words are emitted whole, with the "single" type. */
  public void testC() throws IOException {
    String str = "abc defgh ijklmn opqrstu vwxy z";

    Token[] out_tokens = {
      newToken("abc", 0, 3, CJKTokenizer.SINGLE_TOKEN_TYPE),
      newToken("defgh", 4, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
      newToken("ijklmn", 10, 16, CJKTokenizer.SINGLE_TOKEN_TYPE),
      newToken("opqrstu", 17, 24, CJKTokenizer.SINGLE_TOKEN_TYPE),
      newToken("vwxy", 25, 29, CJKTokenizer.SINGLE_TOKEN_TYPE),
      newToken("z", 30, 31, CJKTokenizer.SINGLE_TOKEN_TYPE),
    };
    checkCJKToken(str, out_tokens);
  }

  /** An ASCII word embedded between two kana runs separates the bigram sequences. */
  public void testMix() throws IOException {
    String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";

    Token[] out_tokens = {
      newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("abc", 5, 8, CJKTokenizer.SINGLE_TOKEN_TYPE),
      newToken("\u304b\u304d", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u304d\u304f", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u304f\u3051", 10,12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3051\u3053", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
    };
    checkCJKToken(str, out_tokens);
  }

  /** Alternating ASCII and kana, including a lone kana between ASCII letters and one after a space. */
  public void testMix2() throws IOException {
    String str = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053";

    Token[] out_tokens = {
      newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("ab", 5, 7, CJKTokenizer.SINGLE_TOKEN_TYPE),
      newToken("\u3093", 7, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("c", 8, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
      newToken("\u304b\u304d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u304d\u304f", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u304f\u3051", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3053", 14,15, CJKTokenizer.DOUBLE_TOKEN_TYPE)
    };
    checkCJKToken(str, out_tokens);
  }

  /** A single ideograph yields exactly one one-character token. */
  public void testSingleChar() throws IOException {
    String str = "\u4e00";

    Token[] out_tokens = {
      newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
    };
    checkCJKToken(str, out_tokens);
  }
}