mirror of https://github.com/apache/lucene.git
- LUCENE-478: Updated Unicode code point ranges for CJK
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@431151 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1a933665d6
commit
5705f8dfd2
|
@ -12,7 +12,11 @@ Changes in runtime behavior
|
|||
Note that this problem still exists for 'a', e.g. in 'a-class' as
|
||||
'a' continues to be a stopword.
|
||||
(Daniel Naber)
|
||||
|
||||
|
||||
2. LUCENE-478: Updated the list of Unicode code point ranges for CJK (now
|
||||
split into CJ and K) in StandardAnalyzer.
|
||||
(John Want and Steven Rowe via Otis Gospodnetic)
|
||||
|
||||
New features
|
||||
|
||||
1. LUCENE-503: New ThaiAnalyzer and ThaiWordFilter in contrib/analyzers
|
||||
|
|
|
@ -40,6 +40,12 @@ public class HTMLParser implements HTMLParserConstants {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated Use HTMLParser(FileInputStream) instead
|
||||
*/
|
||||
public HTMLParser(File file) throws FileNotFoundException {
|
||||
this(new FileInputStream(file));
|
||||
}
|
||||
|
||||
public String getTitle() throws IOException, InterruptedException {
|
||||
if (pipeIn == null)
|
||||
|
|
|
@ -25,6 +25,20 @@ public interface CharStream {
|
|||
*/
|
||||
char readChar() throws java.io.IOException;
|
||||
|
||||
/**
|
||||
* Returns the column position of the character last read.
|
||||
* @deprecated
|
||||
* @see #getEndColumn
|
||||
*/
|
||||
int getColumn();
|
||||
|
||||
/**
|
||||
* Returns the line number of the character last read.
|
||||
* @deprecated
|
||||
* @see #getEndLine
|
||||
*/
|
||||
int getLine();
|
||||
|
||||
/**
|
||||
* Returns the column number of the last character for current token (being
|
||||
* matched after the last call to BeginTOken).
|
||||
|
|
|
@ -103,21 +103,24 @@ TOKEN : { // token patterns
|
|||
"\u00c0"-"\u00d6",
|
||||
"\u00d8"-"\u00f6",
|
||||
"\u00f8"-"\u00ff",
|
||||
"\u0100"-"\u1fff"
|
||||
"\u0100"-"\u1fff",
|
||||
"\uffa0"-"\uffdc"
|
||||
]
|
||||
>
|
||||
| < CJ: // Chinese, Japanese
|
||||
[
|
||||
"\u3040"-"\u318f",
|
||||
"\u31f0"-"\u31ff",
|
||||
"\u3300"-"\u337f",
|
||||
"\u3400"-"\u3d2d",
|
||||
"\u3400"-"\u4db5",
|
||||
"\u4e00"-"\u9fff",
|
||||
"\uf900"-"\ufaff"
|
||||
"\uf900"-"\ufaff",
|
||||
"\uff65"-"\uff9f"
|
||||
]
|
||||
>
|
||||
| < KOREAN: // Korean
|
||||
[
|
||||
"\uac00"-"\ud7af"
|
||||
"\uac00"-"\ud7a3"
|
||||
]
|
||||
>
|
||||
| < #DIGIT: // unicode digits
|
||||
|
|
|
@ -41,54 +41,60 @@ private final void jjCheckNAddStates(int start)
|
|||
jjCheckNAdd(jjnextStates[start + 1]);
|
||||
}
|
||||
static final long[] jjbitVec0 = {
|
||||
0x1ff0000000000000L, 0xffffffffffffc000L, 0xffffffffL, 0x600000000000000L
|
||||
0xfff0000000000000L, 0xffffffffffffdfffL, 0xffffffffL, 0x600000000000000L
|
||||
};
|
||||
static final long[] jjbitVec2 = {
|
||||
0x0L, 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL
|
||||
};
|
||||
static final long[] jjbitVec3 = {
|
||||
0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffL, 0x0L
|
||||
0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffL, 0xffff000000000000L
|
||||
};
|
||||
static final long[] jjbitVec4 = {
|
||||
0xffffffffffffffffL, 0xffffffffffffffffL, 0x0L, 0x0L
|
||||
};
|
||||
static final long[] jjbitVec5 = {
|
||||
0x3fffffffffffL, 0x0L, 0x0L, 0x0L
|
||||
0xffffffffffffffffL, 0xffffffffffffffffL, 0x3fffffffffffffL, 0x0L
|
||||
};
|
||||
static final long[] jjbitVec6 = {
|
||||
0x0L, 0x0L, 0xfffff00000000000L, 0x7fffffL
|
||||
0x0L, 0xffffffe000000000L, 0xffffffffL, 0x0L
|
||||
};
|
||||
static final long[] jjbitVec7 = {
|
||||
0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffL, 0x0L
|
||||
0x0L, 0x0L, 0xfffff00000000000L, 0x7fffffL
|
||||
};
|
||||
static final long[] jjbitVec8 = {
|
||||
0xfffffffeL, 0x0L, 0x0L, 0x0L
|
||||
0xffffffffffffffffL, 0xffffffffffffffffL, 0xfffffffffL, 0x0L
|
||||
};
|
||||
static final long[] jjbitVec9 = {
|
||||
0x0L, 0x0L, 0x0L, 0xff7fffffff7fffffL
|
||||
0xfffffffeL, 0x0L, 0x0L, 0x0L
|
||||
};
|
||||
static final long[] jjbitVec10 = {
|
||||
0x1600L, 0x0L, 0x0L, 0x0L
|
||||
0x0L, 0x0L, 0x0L, 0xff7fffffff7fffffL
|
||||
};
|
||||
static final long[] jjbitVec11 = {
|
||||
0x0L, 0xffc000000000L, 0x0L, 0xffc000000000L
|
||||
0x0L, 0x0L, 0xffffffff00000000L, 0x1fffffffL
|
||||
};
|
||||
static final long[] jjbitVec12 = {
|
||||
0x0L, 0x3ff00000000L, 0x0L, 0x3ff000000000000L
|
||||
0x1600L, 0x0L, 0x0L, 0x0L
|
||||
};
|
||||
static final long[] jjbitVec13 = {
|
||||
0x0L, 0xffc000000000L, 0x0L, 0xff8000000000L
|
||||
0x0L, 0xffc000000000L, 0x0L, 0xffc000000000L
|
||||
};
|
||||
static final long[] jjbitVec14 = {
|
||||
0x0L, 0xffc000000000L, 0x0L, 0x0L
|
||||
0x0L, 0x3ff00000000L, 0x0L, 0x3ff000000000000L
|
||||
};
|
||||
static final long[] jjbitVec15 = {
|
||||
0x0L, 0x3ff0000L, 0x0L, 0x3ff0000L
|
||||
0x0L, 0xffc000000000L, 0x0L, 0xff8000000000L
|
||||
};
|
||||
static final long[] jjbitVec16 = {
|
||||
0x0L, 0x3ffL, 0x0L, 0x0L
|
||||
0x0L, 0xffc000000000L, 0x0L, 0x0L
|
||||
};
|
||||
static final long[] jjbitVec17 = {
|
||||
0x0L, 0x3ff0000L, 0x0L, 0x3ff0000L
|
||||
};
|
||||
static final long[] jjbitVec18 = {
|
||||
0x0L, 0x3ffL, 0x0L, 0x0L
|
||||
};
|
||||
static final long[] jjbitVec19 = {
|
||||
0xfffffffeL, 0x0L, 0xfffff00000000000L, 0x7fffffL
|
||||
};
|
||||
private final int jjMoveNfa_0(int startState, int curPos)
|
||||
|
@ -1012,8 +1018,10 @@ private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, lo
|
|||
return ((jjbitVec3[i2] & l2) != 0L);
|
||||
case 51:
|
||||
return ((jjbitVec4[i2] & l2) != 0L);
|
||||
case 61:
|
||||
case 77:
|
||||
return ((jjbitVec5[i2] & l2) != 0L);
|
||||
case 255:
|
||||
return ((jjbitVec6[i2] & l2) != 0L);
|
||||
default :
|
||||
if ((jjbitVec0[i1] & l1) != 0L)
|
||||
return true;
|
||||
|
@ -1025,9 +1033,9 @@ private static final boolean jjCanMove_1(int hiByte, int i1, int i2, long l1, lo
|
|||
switch(hiByte)
|
||||
{
|
||||
case 215:
|
||||
return ((jjbitVec7[i2] & l2) != 0L);
|
||||
return ((jjbitVec8[i2] & l2) != 0L);
|
||||
default :
|
||||
if ((jjbitVec6[i1] & l1) != 0L)
|
||||
if ((jjbitVec7[i1] & l1) != 0L)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
@ -1037,9 +1045,11 @@ private static final boolean jjCanMove_2(int hiByte, int i1, int i2, long l1, lo
|
|||
switch(hiByte)
|
||||
{
|
||||
case 0:
|
||||
return ((jjbitVec9[i2] & l2) != 0L);
|
||||
return ((jjbitVec10[i2] & l2) != 0L);
|
||||
case 255:
|
||||
return ((jjbitVec11[i2] & l2) != 0L);
|
||||
default :
|
||||
if ((jjbitVec8[i1] & l1) != 0L)
|
||||
if ((jjbitVec9[i1] & l1) != 0L)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
@ -1049,18 +1059,18 @@ private static final boolean jjCanMove_3(int hiByte, int i1, int i2, long l1, lo
|
|||
switch(hiByte)
|
||||
{
|
||||
case 6:
|
||||
return ((jjbitVec12[i2] & l2) != 0L);
|
||||
case 11:
|
||||
return ((jjbitVec13[i2] & l2) != 0L);
|
||||
case 13:
|
||||
return ((jjbitVec14[i2] & l2) != 0L);
|
||||
case 14:
|
||||
case 11:
|
||||
return ((jjbitVec15[i2] & l2) != 0L);
|
||||
case 16:
|
||||
case 13:
|
||||
return ((jjbitVec16[i2] & l2) != 0L);
|
||||
case 14:
|
||||
return ((jjbitVec17[i2] & l2) != 0L);
|
||||
case 16:
|
||||
return ((jjbitVec18[i2] & l2) != 0L);
|
||||
default :
|
||||
if ((jjbitVec10[i1] & l1) != 0L)
|
||||
if ((jjbitVec11[i2] & l2) == 0L)
|
||||
if ((jjbitVec12[i1] & l1) != 0L)
|
||||
if ((jjbitVec13[i2] & l2) == 0L)
|
||||
return false;
|
||||
else
|
||||
return true;
|
||||
|
@ -1072,11 +1082,13 @@ private static final boolean jjCanMove_4(int hiByte, int i1, int i2, long l1, lo
|
|||
switch(hiByte)
|
||||
{
|
||||
case 0:
|
||||
return ((jjbitVec9[i2] & l2) != 0L);
|
||||
return ((jjbitVec10[i2] & l2) != 0L);
|
||||
case 215:
|
||||
return ((jjbitVec7[i2] & l2) != 0L);
|
||||
return ((jjbitVec8[i2] & l2) != 0L);
|
||||
case 255:
|
||||
return ((jjbitVec11[i2] & l2) != 0L);
|
||||
default :
|
||||
if ((jjbitVec17[i1] & l1) != 0L)
|
||||
if ((jjbitVec19[i1] & l1) != 0L)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue