mirror of https://github.com/apache/lucene.git
LUCENE-973: add test case for CJKAnalyzer; fix trailing empty string bug
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@785287 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2f7b9df887
commit
835c405be0
|
@ -37,7 +37,18 @@ import java.io.Reader;
|
|||
*/
|
||||
public final class CJKTokenizer extends Tokenizer {
|
||||
//~ Static fields/initializers ---------------------------------------------
|
||||
/** Word token type */
|
||||
static final int WORD_TYPE = 0;
|
||||
|
||||
/** Single byte token type */
|
||||
static final int SINGLE_TOKEN_TYPE = 1;
|
||||
|
||||
/** Double byte token type */
|
||||
static final int DOUBLE_TOKEN_TYPE = 2;
|
||||
|
||||
/** Names for token types */
|
||||
static final String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };
|
||||
|
||||
/** Max word length */
|
||||
private static final int MAX_WORD_LEN = 255;
|
||||
|
||||
|
@ -68,7 +79,7 @@ public final class CJKTokenizer extends Tokenizer {
|
|||
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
|
||||
|
||||
/** word type: single=>ASCII double=>non-ASCII word=>default */
|
||||
private String tokenType = "word";
|
||||
private int tokenType = WORD_TYPE;
|
||||
|
||||
/**
|
||||
* tag: previous character is a cached double-byte character "C1C2C3C4"
|
||||
|
@ -105,12 +116,15 @@ public final class CJKTokenizer extends Tokenizer {
|
|||
public final Token next(final Token reusableToken) throws java.io.IOException {
|
||||
/** how many character(s) has been stored in buffer */
|
||||
assert reusableToken != null;
|
||||
int length = 0;
|
||||
|
||||
/** the position used to create Token */
|
||||
int start = offset;
|
||||
while(true) { // loop until we find a non-empty token
|
||||
|
||||
while (true) {
|
||||
int length = 0;
|
||||
|
||||
/** the position used to create Token */
|
||||
int start = offset;
|
||||
|
||||
while (true) { // loop until we've found a full token
|
||||
/** current character */
|
||||
char c;
|
||||
|
||||
|
@ -150,7 +164,7 @@ public final class CJKTokenizer extends Tokenizer {
|
|||
if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
|
||||
int i = (int) c;
|
||||
if (i >= 65281 && i <= 65374) {
|
||||
/** convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
|
||||
// convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
|
||||
i = i - 65248;
|
||||
c = (char) i;
|
||||
}
|
||||
|
@ -165,19 +179,17 @@ public final class CJKTokenizer extends Tokenizer {
|
|||
// ^--: the current character begin to token the ASCII
|
||||
// letter
|
||||
start = offset - 1;
|
||||
} else if (tokenType == "double") {
|
||||
} else if (tokenType == DOUBLE_TOKEN_TYPE) {
|
||||
// "javaC1C2C3C4linux" <br>
|
||||
// ^--: the previous non-ASCII
|
||||
// : the current character
|
||||
offset--;
|
||||
bufferIndex--;
|
||||
tokenType = "single";
|
||||
|
||||
if (preIsTokened == true) {
|
||||
// there is only one non-ASCII has been stored
|
||||
length = 0;
|
||||
preIsTokened = false;
|
||||
|
||||
break;
|
||||
} else {
|
||||
break;
|
||||
|
@ -186,7 +198,7 @@ public final class CJKTokenizer extends Tokenizer {
|
|||
|
||||
// store the LowerCase(c) in the buffer
|
||||
buffer[length++] = Character.toLowerCase(c);
|
||||
tokenType = "single";
|
||||
tokenType = SINGLE_TOKEN_TYPE;
|
||||
|
||||
// break the procedure if buffer overflowed!
|
||||
if (length == MAX_WORD_LEN) {
|
||||
|
@ -206,9 +218,9 @@ public final class CJKTokenizer extends Tokenizer {
|
|||
if (length == 0) {
|
||||
start = offset - 1;
|
||||
buffer[length++] = c;
|
||||
tokenType = "double";
|
||||
tokenType = DOUBLE_TOKEN_TYPE;
|
||||
} else {
|
||||
if (tokenType == "single") {
|
||||
if (tokenType == SINGLE_TOKEN_TYPE) {
|
||||
offset--;
|
||||
bufferIndex--;
|
||||
|
||||
|
@ -216,7 +228,7 @@ public final class CJKTokenizer extends Tokenizer {
|
|||
break;
|
||||
} else {
|
||||
buffer[length++] = c;
|
||||
tokenType = "double";
|
||||
tokenType = DOUBLE_TOKEN_TYPE;
|
||||
|
||||
if (length == 2) {
|
||||
offset--;
|
||||
|
@ -238,7 +250,16 @@ public final class CJKTokenizer extends Tokenizer {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (length > 0) {
|
||||
return reusableToken.reinit
|
||||
(buffer, 0, length, start, start+length, TOKEN_TYPE_NAMES[tokenType]);
|
||||
} else if (dataLen == -1) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return reusableToken.reinit(buffer, 0, length, start, start+length, tokenType);
|
||||
// Cycle back and try for the next token (don't
|
||||
// return an empty string)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,155 @@
|
|||
package org.apache.lucene.analysis.cjk;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
|
||||
public class TestCJKTokenizer extends TestCase{
|
||||
|
||||
public Token newToken(String termText, int start, int end, int type) {
|
||||
Token token = new Token(start, end);
|
||||
token.setTermBuffer(termText);
|
||||
token.setType(CJKTokenizer.TOKEN_TYPE_NAMES[type]);
|
||||
return token;
|
||||
}
|
||||
|
||||
public void checkCJKToken(final String str, final Token[] out_tokens) throws IOException {
|
||||
CJKTokenizer tokenizer = new CJKTokenizer(new StringReader(str));
|
||||
int i = 0;
|
||||
System.out.println("string[" + str + "]");
|
||||
System.out.print("tokens[");
|
||||
final Token reusableToken = new Token();
|
||||
for (Token token = tokenizer.next(reusableToken) ;
|
||||
token != null ;
|
||||
token = tokenizer.next(reusableToken) ) {
|
||||
if (token.term().equals(out_tokens[i].term())
|
||||
&& token.startOffset() == out_tokens[i].startOffset()
|
||||
&& token.endOffset() == out_tokens[i].endOffset()
|
||||
&& token.type().equals(out_tokens[i].type()) ) {
|
||||
System.out.print( token.term() + " ");
|
||||
}
|
||||
else {
|
||||
fail(token.term() + " (start: " + token.startOffset()
|
||||
+ " end: " + token.endOffset() + " type: " + token.type() + ") != "
|
||||
+ out_tokens[i].term() + " (start: " + out_tokens[i].startOffset()
|
||||
+ " end: " + out_tokens[i].endOffset()
|
||||
+ " type: " + out_tokens[i].type() + ")");
|
||||
break;
|
||||
}
|
||||
++i;
|
||||
}
|
||||
System.out.println("]" + System.getProperty("line.separator"));
|
||||
}
|
||||
|
||||
public void testJa1() throws IOException {
|
||||
String str = "\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341";
|
||||
|
||||
Token[] out_tokens = {
|
||||
newToken("\u4e00\u4e8c", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u4e8c\u4e09", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u4e09\u56db", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u56db\u4e94", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u4e94\u516d", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u516d\u4e03", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u4e03\u516b", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u516b\u4e5d", 7, 9, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u4e5d\u5341", 8,10, CJKTokenizer.DOUBLE_TOKEN_TYPE)
|
||||
};
|
||||
checkCJKToken(str, out_tokens);
|
||||
}
|
||||
|
||||
public void testJa2() throws IOException {
|
||||
String str = "\u4e00 \u4e8c\u4e09\u56db \u4e94\u516d\u4e03\u516b\u4e5d \u5341";
|
||||
|
||||
Token[] out_tokens = {
|
||||
newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u4e8c\u4e09", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u4e09\u56db", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u4e94\u516d", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u516d\u4e03", 7, 9, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u4e03\u516b", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u516b\u4e5d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u5341", 12,13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
|
||||
};
|
||||
checkCJKToken(str, out_tokens);
|
||||
}
|
||||
|
||||
public void testC() throws IOException {
|
||||
String str = "abc defgh ijklmn opqrstu vwxy z";
|
||||
|
||||
Token[] out_tokens = {
|
||||
newToken("abc", 0, 3, CJKTokenizer.SINGLE_TOKEN_TYPE),
|
||||
newToken("defgh", 4, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
|
||||
newToken("ijklmn", 10, 16, CJKTokenizer.SINGLE_TOKEN_TYPE),
|
||||
newToken("opqrstu", 17, 24, CJKTokenizer.SINGLE_TOKEN_TYPE),
|
||||
newToken("vwxy", 25, 29, CJKTokenizer.SINGLE_TOKEN_TYPE),
|
||||
newToken("z", 30, 31, CJKTokenizer.SINGLE_TOKEN_TYPE),
|
||||
};
|
||||
checkCJKToken(str, out_tokens);
|
||||
}
|
||||
|
||||
public void testMix() throws IOException {
|
||||
String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";
|
||||
|
||||
Token[] out_tokens = {
|
||||
newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("abc", 5, 8, CJKTokenizer.SINGLE_TOKEN_TYPE),
|
||||
newToken("\u304b\u304d", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u304d\u304f", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u304f\u3051", 10,12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u3051\u3053", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
|
||||
};
|
||||
checkCJKToken(str, out_tokens);
|
||||
}
|
||||
|
||||
public void testMix2() throws IOException {
|
||||
String str = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053";
|
||||
|
||||
Token[] out_tokens = {
|
||||
newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("ab", 5, 7, CJKTokenizer.SINGLE_TOKEN_TYPE),
|
||||
newToken("\u3093", 7, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("c", 8, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
|
||||
newToken("\u304b\u304d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u304d\u304f", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u304f\u3051", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u3053", 14,15, CJKTokenizer.DOUBLE_TOKEN_TYPE)
|
||||
};
|
||||
checkCJKToken(str, out_tokens);
|
||||
}
|
||||
|
||||
public void testSingleChar() throws IOException {
|
||||
String str = "\u4e00";
|
||||
|
||||
Token[] out_tokens = {
|
||||
newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
};
|
||||
checkCJKToken(str, out_tokens);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue