- LUCENE-478: Updated Unicode code point ranges for CJK

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@431151 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Otis Gospodnetic 2006-08-13 07:02:26 +00:00
parent 1a933665d6
commit 5705f8dfd2
5 changed files with 74 additions and 35 deletions

View File

@ -12,7 +12,11 @@ Changes in runtime behavior
Note that this problem still exists for 'a', e.g. in 'a-class' as
'a' continues to be a stopword.
(Daniel Naber)
2. LUCENE-478: Updated the list of Unicode code point ranges for CJK (now
split into CJ and K) in StandardAnalyzer.
(John Want and Steven Rowe via Otis Gospodnetic)
New features
1. LUCENE-503: New ThaiAnalyzer and ThaiWordFilter in contrib/analyzers

View File

@ -40,6 +40,12 @@ public class HTMLParser implements HTMLParserConstants {
}
}
/**
* Creates a parser that reads its input from the given file. The file is
* opened with a new {@link FileInputStream}, which is then handed to the
* stream-based constructor of this class.
*
* @param file the file whose contents are to be parsed
* @throws FileNotFoundException if a {@link FileInputStream} cannot be
*         opened on <code>file</code>
* @deprecated Use HTMLParser(FileInputStream) instead
*/
public HTMLParser(File file) throws FileNotFoundException {
this(new FileInputStream(file));
}
public String getTitle() throws IOException, InterruptedException {
if (pipeIn == null)

View File

@ -25,6 +25,20 @@ public interface CharStream {
*/
char readChar() throws java.io.IOException;
/**
* Returns the column position of the character last read.
*
* @return the column position of the character last read
* @deprecated See {@link #getEndColumn}, which this method's documentation
*             points to as the accessor to use instead.
* @see #getEndColumn
*/
int getColumn();
/**
* Returns the line number of the character last read.
*
* @return the line number of the character last read
* @deprecated See {@link #getEndLine}, which this method's documentation
*             points to as the accessor to use instead.
* @see #getEndLine
*/
int getLine();
/**
* Returns the column number of the last character for current token (being
* matched after the last call to BeginToken).

View File

@ -103,21 +103,24 @@ TOKEN : { // token patterns
"\u00c0"-"\u00d6",
"\u00d8"-"\u00f6",
"\u00f8"-"\u00ff",
"\u0100"-"\u1fff"
"\u0100"-"\u1fff",
"\uffa0"-"\uffdc"
]
>
| < CJ: // Chinese, Japanese
[
"\u3040"-"\u318f",
"\u31f0"-"\u31ff",
"\u3300"-"\u337f",
"\u3400"-"\u3d2d",
"\u3400"-"\u4db5",
"\u4e00"-"\u9fff",
"\uf900"-"\ufaff"
"\uf900"-"\ufaff",
"\uff65"-"\uff9f"
]
>
| < KOREAN: // Korean
[
"\uac00"-"\ud7af"
"\uac00"-"\ud7a3"
]
>
| < #DIGIT: // unicode digits

View File

@ -41,54 +41,60 @@ private final void jjCheckNAddStates(int start)
jjCheckNAdd(jjnextStates[start + 1]);
}
static final long[] jjbitVec0 = {
0x1ff0000000000000L, 0xffffffffffffc000L, 0xffffffffL, 0x600000000000000L
0xfff0000000000000L, 0xffffffffffffdfffL, 0xffffffffL, 0x600000000000000L
};
static final long[] jjbitVec2 = {
0x0L, 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL
};
static final long[] jjbitVec3 = {
0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffL, 0x0L
0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffL, 0xffff000000000000L
};
static final long[] jjbitVec4 = {
0xffffffffffffffffL, 0xffffffffffffffffL, 0x0L, 0x0L
};
static final long[] jjbitVec5 = {
0x3fffffffffffL, 0x0L, 0x0L, 0x0L
0xffffffffffffffffL, 0xffffffffffffffffL, 0x3fffffffffffffL, 0x0L
};
static final long[] jjbitVec6 = {
0x0L, 0x0L, 0xfffff00000000000L, 0x7fffffL
0x0L, 0xffffffe000000000L, 0xffffffffL, 0x0L
};
static final long[] jjbitVec7 = {
0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffL, 0x0L
0x0L, 0x0L, 0xfffff00000000000L, 0x7fffffL
};
static final long[] jjbitVec8 = {
0xfffffffeL, 0x0L, 0x0L, 0x0L
0xffffffffffffffffL, 0xffffffffffffffffL, 0xfffffffffL, 0x0L
};
static final long[] jjbitVec9 = {
0x0L, 0x0L, 0x0L, 0xff7fffffff7fffffL
0xfffffffeL, 0x0L, 0x0L, 0x0L
};
static final long[] jjbitVec10 = {
0x1600L, 0x0L, 0x0L, 0x0L
0x0L, 0x0L, 0x0L, 0xff7fffffff7fffffL
};
static final long[] jjbitVec11 = {
0x0L, 0xffc000000000L, 0x0L, 0xffc000000000L
0x0L, 0x0L, 0xffffffff00000000L, 0x1fffffffL
};
static final long[] jjbitVec12 = {
0x0L, 0x3ff00000000L, 0x0L, 0x3ff000000000000L
0x1600L, 0x0L, 0x0L, 0x0L
};
static final long[] jjbitVec13 = {
0x0L, 0xffc000000000L, 0x0L, 0xff8000000000L
0x0L, 0xffc000000000L, 0x0L, 0xffc000000000L
};
static final long[] jjbitVec14 = {
0x0L, 0xffc000000000L, 0x0L, 0x0L
0x0L, 0x3ff00000000L, 0x0L, 0x3ff000000000000L
};
static final long[] jjbitVec15 = {
0x0L, 0x3ff0000L, 0x0L, 0x3ff0000L
0x0L, 0xffc000000000L, 0x0L, 0xff8000000000L
};
static final long[] jjbitVec16 = {
0x0L, 0x3ffL, 0x0L, 0x0L
0x0L, 0xffc000000000L, 0x0L, 0x0L
};
static final long[] jjbitVec17 = {
0x0L, 0x3ff0000L, 0x0L, 0x3ff0000L
};
static final long[] jjbitVec18 = {
0x0L, 0x3ffL, 0x0L, 0x0L
};
static final long[] jjbitVec19 = {
0xfffffffeL, 0x0L, 0xfffff00000000000L, 0x7fffffL
};
private final int jjMoveNfa_0(int startState, int curPos)
@ -1012,8 +1018,10 @@ private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, lo
return ((jjbitVec3[i2] & l2) != 0L);
case 51:
return ((jjbitVec4[i2] & l2) != 0L);
case 61:
case 77:
return ((jjbitVec5[i2] & l2) != 0L);
case 255:
return ((jjbitVec6[i2] & l2) != 0L);
default :
if ((jjbitVec0[i1] & l1) != 0L)
return true;
@ -1025,9 +1033,9 @@ private static final boolean jjCanMove_1(int hiByte, int i1, int i2, long l1, lo
switch(hiByte)
{
case 215:
return ((jjbitVec7[i2] & l2) != 0L);
return ((jjbitVec8[i2] & l2) != 0L);
default :
if ((jjbitVec6[i1] & l1) != 0L)
if ((jjbitVec7[i1] & l1) != 0L)
return true;
return false;
}
@ -1037,9 +1045,11 @@ private static final boolean jjCanMove_2(int hiByte, int i1, int i2, long l1, lo
switch(hiByte)
{
case 0:
return ((jjbitVec9[i2] & l2) != 0L);
return ((jjbitVec10[i2] & l2) != 0L);
case 255:
return ((jjbitVec11[i2] & l2) != 0L);
default :
if ((jjbitVec8[i1] & l1) != 0L)
if ((jjbitVec9[i1] & l1) != 0L)
return true;
return false;
}
@ -1049,18 +1059,18 @@ private static final boolean jjCanMove_3(int hiByte, int i1, int i2, long l1, lo
switch(hiByte)
{
case 6:
return ((jjbitVec12[i2] & l2) != 0L);
case 11:
return ((jjbitVec13[i2] & l2) != 0L);
case 13:
return ((jjbitVec14[i2] & l2) != 0L);
case 14:
case 11:
return ((jjbitVec15[i2] & l2) != 0L);
case 16:
case 13:
return ((jjbitVec16[i2] & l2) != 0L);
case 14:
return ((jjbitVec17[i2] & l2) != 0L);
case 16:
return ((jjbitVec18[i2] & l2) != 0L);
default :
if ((jjbitVec10[i1] & l1) != 0L)
if ((jjbitVec11[i2] & l2) == 0L)
if ((jjbitVec12[i1] & l1) != 0L)
if ((jjbitVec13[i2] & l2) == 0L)
return false;
else
return true;
@ -1072,11 +1082,13 @@ private static final boolean jjCanMove_4(int hiByte, int i1, int i2, long l1, lo
switch(hiByte)
{
case 0:
return ((jjbitVec9[i2] & l2) != 0L);
return ((jjbitVec10[i2] & l2) != 0L);
case 215:
return ((jjbitVec7[i2] & l2) != 0L);
return ((jjbitVec8[i2] & l2) != 0L);
case 255:
return ((jjbitVec11[i2] & l2) != 0L);
default :
if ((jjbitVec17[i1] & l1) != 0L)
if ((jjbitVec19[i1] & l1) != 0L)
return true;
return false;
}