- LUCENE-478: Updated Unicode code point ranges for CJK

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@431151 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Otis Gospodnetic 2006-08-13 07:02:26 +00:00
parent 1a933665d6
commit 5705f8dfd2
5 changed files with 74 additions and 35 deletions

View File

@ -13,6 +13,10 @@ Changes in runtime behavior
'a' continues to be a stopword. 'a' continues to be a stopword.
(Daniel Naber) (Daniel Naber)
2. LUCENE-478: Updated the list of Unicode code point ranges for CJK (now
split into CJ and K) in StandardAnalyzer.
(John Want and Steven Rowe via Otis Gospodnetic)
New features New features
1. LUCENE-503: New ThaiAnalyzer and ThaiWordFilter in contrib/analyzers 1. LUCENE-503: New ThaiAnalyzer and ThaiWordFilter in contrib/analyzers

View File

@ -40,6 +40,12 @@ public class HTMLParser implements HTMLParserConstants {
} }
} }
/**
* @deprecated Use HTMLParser(FileInputStream) instead
*/
public HTMLParser(File file) throws FileNotFoundException {
this(new FileInputStream(file));
}
public String getTitle() throws IOException, InterruptedException { public String getTitle() throws IOException, InterruptedException {
if (pipeIn == null) if (pipeIn == null)

View File

@ -25,6 +25,20 @@ public interface CharStream {
*/ */
char readChar() throws java.io.IOException; char readChar() throws java.io.IOException;
/**
* Returns the column position of the character last read.
* @deprecated
* @see #getEndColumn
*/
int getColumn();
/**
* Returns the line number of the character last read.
* @deprecated
* @see #getEndLine
*/
int getLine();
/** /**
* Returns the column number of the last character for current token (being * Returns the column number of the last character for current token (being
* matched after the last call to BeginTOken). * matched after the last call to BeginTOken).

View File

@ -103,21 +103,24 @@ TOKEN : { // token patterns
"\u00c0"-"\u00d6", "\u00c0"-"\u00d6",
"\u00d8"-"\u00f6", "\u00d8"-"\u00f6",
"\u00f8"-"\u00ff", "\u00f8"-"\u00ff",
"\u0100"-"\u1fff" "\u0100"-"\u1fff",
"\uffa0"-"\uffdc"
] ]
> >
| < CJ: // Chinese, Japanese | < CJ: // Chinese, Japanese
[ [
"\u3040"-"\u318f", "\u3040"-"\u318f",
"\u31f0"-"\u31ff",
"\u3300"-"\u337f", "\u3300"-"\u337f",
"\u3400"-"\u3d2d", "\u3400"-"\u4db5",
"\u4e00"-"\u9fff", "\u4e00"-"\u9fff",
"\uf900"-"\ufaff" "\uf900"-"\ufaff",
"\uff65"-"\uff9f"
] ]
> >
| < KOREAN: // Korean | < KOREAN: // Korean
[ [
"\uac00"-"\ud7af" "\uac00"-"\ud7a3"
] ]
> >
| < #DIGIT: // unicode digits | < #DIGIT: // unicode digits

View File

@ -41,54 +41,60 @@ private final void jjCheckNAddStates(int start)
jjCheckNAdd(jjnextStates[start + 1]); jjCheckNAdd(jjnextStates[start + 1]);
} }
static final long[] jjbitVec0 = { static final long[] jjbitVec0 = {
0x1ff0000000000000L, 0xffffffffffffc000L, 0xffffffffL, 0x600000000000000L 0xfff0000000000000L, 0xffffffffffffdfffL, 0xffffffffL, 0x600000000000000L
}; };
static final long[] jjbitVec2 = { static final long[] jjbitVec2 = {
0x0L, 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL 0x0L, 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL
}; };
static final long[] jjbitVec3 = { static final long[] jjbitVec3 = {
0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffL, 0x0L 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffL, 0xffff000000000000L
}; };
static final long[] jjbitVec4 = { static final long[] jjbitVec4 = {
0xffffffffffffffffL, 0xffffffffffffffffL, 0x0L, 0x0L 0xffffffffffffffffL, 0xffffffffffffffffL, 0x0L, 0x0L
}; };
static final long[] jjbitVec5 = { static final long[] jjbitVec5 = {
0x3fffffffffffL, 0x0L, 0x0L, 0x0L 0xffffffffffffffffL, 0xffffffffffffffffL, 0x3fffffffffffffL, 0x0L
}; };
static final long[] jjbitVec6 = { static final long[] jjbitVec6 = {
0x0L, 0x0L, 0xfffff00000000000L, 0x7fffffL 0x0L, 0xffffffe000000000L, 0xffffffffL, 0x0L
}; };
static final long[] jjbitVec7 = { static final long[] jjbitVec7 = {
0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffL, 0x0L 0x0L, 0x0L, 0xfffff00000000000L, 0x7fffffL
}; };
static final long[] jjbitVec8 = { static final long[] jjbitVec8 = {
0xfffffffeL, 0x0L, 0x0L, 0x0L 0xffffffffffffffffL, 0xffffffffffffffffL, 0xfffffffffL, 0x0L
}; };
static final long[] jjbitVec9 = { static final long[] jjbitVec9 = {
0x0L, 0x0L, 0x0L, 0xff7fffffff7fffffL 0xfffffffeL, 0x0L, 0x0L, 0x0L
}; };
static final long[] jjbitVec10 = { static final long[] jjbitVec10 = {
0x1600L, 0x0L, 0x0L, 0x0L 0x0L, 0x0L, 0x0L, 0xff7fffffff7fffffL
}; };
static final long[] jjbitVec11 = { static final long[] jjbitVec11 = {
0x0L, 0xffc000000000L, 0x0L, 0xffc000000000L 0x0L, 0x0L, 0xffffffff00000000L, 0x1fffffffL
}; };
static final long[] jjbitVec12 = { static final long[] jjbitVec12 = {
0x0L, 0x3ff00000000L, 0x0L, 0x3ff000000000000L 0x1600L, 0x0L, 0x0L, 0x0L
}; };
static final long[] jjbitVec13 = { static final long[] jjbitVec13 = {
0x0L, 0xffc000000000L, 0x0L, 0xff8000000000L 0x0L, 0xffc000000000L, 0x0L, 0xffc000000000L
}; };
static final long[] jjbitVec14 = { static final long[] jjbitVec14 = {
0x0L, 0xffc000000000L, 0x0L, 0x0L 0x0L, 0x3ff00000000L, 0x0L, 0x3ff000000000000L
}; };
static final long[] jjbitVec15 = { static final long[] jjbitVec15 = {
0x0L, 0x3ff0000L, 0x0L, 0x3ff0000L 0x0L, 0xffc000000000L, 0x0L, 0xff8000000000L
}; };
static final long[] jjbitVec16 = { static final long[] jjbitVec16 = {
0x0L, 0x3ffL, 0x0L, 0x0L 0x0L, 0xffc000000000L, 0x0L, 0x0L
}; };
static final long[] jjbitVec17 = { static final long[] jjbitVec17 = {
0x0L, 0x3ff0000L, 0x0L, 0x3ff0000L
};
static final long[] jjbitVec18 = {
0x0L, 0x3ffL, 0x0L, 0x0L
};
static final long[] jjbitVec19 = {
0xfffffffeL, 0x0L, 0xfffff00000000000L, 0x7fffffL 0xfffffffeL, 0x0L, 0xfffff00000000000L, 0x7fffffL
}; };
private final int jjMoveNfa_0(int startState, int curPos) private final int jjMoveNfa_0(int startState, int curPos)
@ -1012,8 +1018,10 @@ private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, lo
return ((jjbitVec3[i2] & l2) != 0L); return ((jjbitVec3[i2] & l2) != 0L);
case 51: case 51:
return ((jjbitVec4[i2] & l2) != 0L); return ((jjbitVec4[i2] & l2) != 0L);
case 61: case 77:
return ((jjbitVec5[i2] & l2) != 0L); return ((jjbitVec5[i2] & l2) != 0L);
case 255:
return ((jjbitVec6[i2] & l2) != 0L);
default : default :
if ((jjbitVec0[i1] & l1) != 0L) if ((jjbitVec0[i1] & l1) != 0L)
return true; return true;
@ -1025,9 +1033,9 @@ private static final boolean jjCanMove_1(int hiByte, int i1, int i2, long l1, lo
switch(hiByte) switch(hiByte)
{ {
case 215: case 215:
return ((jjbitVec7[i2] & l2) != 0L); return ((jjbitVec8[i2] & l2) != 0L);
default : default :
if ((jjbitVec6[i1] & l1) != 0L) if ((jjbitVec7[i1] & l1) != 0L)
return true; return true;
return false; return false;
} }
@ -1037,9 +1045,11 @@ private static final boolean jjCanMove_2(int hiByte, int i1, int i2, long l1, lo
switch(hiByte) switch(hiByte)
{ {
case 0: case 0:
return ((jjbitVec9[i2] & l2) != 0L); return ((jjbitVec10[i2] & l2) != 0L);
case 255:
return ((jjbitVec11[i2] & l2) != 0L);
default : default :
if ((jjbitVec8[i1] & l1) != 0L) if ((jjbitVec9[i1] & l1) != 0L)
return true; return true;
return false; return false;
} }
@ -1049,18 +1059,18 @@ private static final boolean jjCanMove_3(int hiByte, int i1, int i2, long l1, lo
switch(hiByte) switch(hiByte)
{ {
case 6: case 6:
return ((jjbitVec12[i2] & l2) != 0L);
case 11:
return ((jjbitVec13[i2] & l2) != 0L);
case 13:
return ((jjbitVec14[i2] & l2) != 0L); return ((jjbitVec14[i2] & l2) != 0L);
case 14: case 11:
return ((jjbitVec15[i2] & l2) != 0L); return ((jjbitVec15[i2] & l2) != 0L);
case 16: case 13:
return ((jjbitVec16[i2] & l2) != 0L); return ((jjbitVec16[i2] & l2) != 0L);
case 14:
return ((jjbitVec17[i2] & l2) != 0L);
case 16:
return ((jjbitVec18[i2] & l2) != 0L);
default : default :
if ((jjbitVec10[i1] & l1) != 0L) if ((jjbitVec12[i1] & l1) != 0L)
if ((jjbitVec11[i2] & l2) == 0L) if ((jjbitVec13[i2] & l2) == 0L)
return false; return false;
else else
return true; return true;
@ -1072,11 +1082,13 @@ private static final boolean jjCanMove_4(int hiByte, int i1, int i2, long l1, lo
switch(hiByte) switch(hiByte)
{ {
case 0: case 0:
return ((jjbitVec9[i2] & l2) != 0L); return ((jjbitVec10[i2] & l2) != 0L);
case 215: case 215:
return ((jjbitVec7[i2] & l2) != 0L); return ((jjbitVec8[i2] & l2) != 0L);
case 255:
return ((jjbitVec11[i2] & l2) != 0L);
default : default :
if ((jjbitVec17[i1] & l1) != 0L) if ((jjbitVec19[i1] & l1) != 0L)
return true; return true;
return false; return false;
} }