mirror of https://github.com/apache/lucene.git
- LUCENE-478: Updated Unicode code point ranges for CJK
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@431151 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1a933665d6
commit
5705f8dfd2
|
@ -12,7 +12,11 @@ Changes in runtime behavior
|
||||||
Note that this problem still exists for 'a', e.g. in 'a-class' as
|
Note that this problem still exists for 'a', e.g. in 'a-class' as
|
||||||
'a' continues to be a stopword.
|
'a' continues to be a stopword.
|
||||||
(Daniel Naber)
|
(Daniel Naber)
|
||||||
|
|
||||||
|
2. LUCENE-478: Updated the list of Unicode code point ranges for CJK (now
|
||||||
|
split into CJ and K) in StandardAnalyzer.
|
||||||
|
(John Wang and Steven Rowe via Otis Gospodnetic)
|
||||||
|
|
||||||
New features
|
New features
|
||||||
|
|
||||||
1. LUCENE-503: New ThaiAnalyzer and ThaiWordFilter in contrib/analyzers
|
1. LUCENE-503: New ThaiAnalyzer and ThaiWordFilter in contrib/analyzers
|
||||||
|
|
|
@ -40,6 +40,12 @@ public class HTMLParser implements HTMLParserConstants {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @deprecated Use HTMLParser(FileInputStream) instead
|
||||||
|
*/
|
||||||
|
public HTMLParser(File file) throws FileNotFoundException {
|
||||||
|
this(new FileInputStream(file));
|
||||||
|
}
|
||||||
|
|
||||||
public String getTitle() throws IOException, InterruptedException {
|
public String getTitle() throws IOException, InterruptedException {
|
||||||
if (pipeIn == null)
|
if (pipeIn == null)
|
||||||
|
|
|
@ -25,6 +25,20 @@ public interface CharStream {
|
||||||
*/
|
*/
|
||||||
char readChar() throws java.io.IOException;
|
char readChar() throws java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the column position of the character last read.
|
||||||
|
* @deprecated
|
||||||
|
* @see #getEndColumn
|
||||||
|
*/
|
||||||
|
int getColumn();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the line number of the character last read.
|
||||||
|
* @deprecated
|
||||||
|
* @see #getEndLine
|
||||||
|
*/
|
||||||
|
int getLine();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the column number of the last character for current token (being
|
* Returns the column number of the last character for current token (being
|
||||||
* matched after the last call to BeginToken).
|
* matched after the last call to BeginToken).
|
||||||
|
|
|
@ -103,21 +103,24 @@ TOKEN : { // token patterns
|
||||||
"\u00c0"-"\u00d6",
|
"\u00c0"-"\u00d6",
|
||||||
"\u00d8"-"\u00f6",
|
"\u00d8"-"\u00f6",
|
||||||
"\u00f8"-"\u00ff",
|
"\u00f8"-"\u00ff",
|
||||||
"\u0100"-"\u1fff"
|
"\u0100"-"\u1fff",
|
||||||
|
"\uffa0"-"\uffdc"
|
||||||
]
|
]
|
||||||
>
|
>
|
||||||
| < CJ: // Chinese, Japanese
|
| < CJ: // Chinese, Japanese
|
||||||
[
|
[
|
||||||
"\u3040"-"\u318f",
|
"\u3040"-"\u318f",
|
||||||
|
"\u31f0"-"\u31ff",
|
||||||
"\u3300"-"\u337f",
|
"\u3300"-"\u337f",
|
||||||
"\u3400"-"\u3d2d",
|
"\u3400"-"\u4db5",
|
||||||
"\u4e00"-"\u9fff",
|
"\u4e00"-"\u9fff",
|
||||||
"\uf900"-"\ufaff"
|
"\uf900"-"\ufaff",
|
||||||
|
"\uff65"-"\uff9f"
|
||||||
]
|
]
|
||||||
>
|
>
|
||||||
| < KOREAN: // Korean
|
| < KOREAN: // Korean
|
||||||
[
|
[
|
||||||
"\uac00"-"\ud7af"
|
"\uac00"-"\ud7a3"
|
||||||
]
|
]
|
||||||
>
|
>
|
||||||
| < #DIGIT: // unicode digits
|
| < #DIGIT: // unicode digits
|
||||||
|
|
|
@ -41,54 +41,60 @@ private final void jjCheckNAddStates(int start)
|
||||||
jjCheckNAdd(jjnextStates[start + 1]);
|
jjCheckNAdd(jjnextStates[start + 1]);
|
||||||
}
|
}
|
||||||
static final long[] jjbitVec0 = {
|
static final long[] jjbitVec0 = {
|
||||||
0x1ff0000000000000L, 0xffffffffffffc000L, 0xffffffffL, 0x600000000000000L
|
0xfff0000000000000L, 0xffffffffffffdfffL, 0xffffffffL, 0x600000000000000L
|
||||||
};
|
};
|
||||||
static final long[] jjbitVec2 = {
|
static final long[] jjbitVec2 = {
|
||||||
0x0L, 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL
|
0x0L, 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL
|
||||||
};
|
};
|
||||||
static final long[] jjbitVec3 = {
|
static final long[] jjbitVec3 = {
|
||||||
0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffL, 0x0L
|
0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffL, 0xffff000000000000L
|
||||||
};
|
};
|
||||||
static final long[] jjbitVec4 = {
|
static final long[] jjbitVec4 = {
|
||||||
0xffffffffffffffffL, 0xffffffffffffffffL, 0x0L, 0x0L
|
0xffffffffffffffffL, 0xffffffffffffffffL, 0x0L, 0x0L
|
||||||
};
|
};
|
||||||
static final long[] jjbitVec5 = {
|
static final long[] jjbitVec5 = {
|
||||||
0x3fffffffffffL, 0x0L, 0x0L, 0x0L
|
0xffffffffffffffffL, 0xffffffffffffffffL, 0x3fffffffffffffL, 0x0L
|
||||||
};
|
};
|
||||||
static final long[] jjbitVec6 = {
|
static final long[] jjbitVec6 = {
|
||||||
0x0L, 0x0L, 0xfffff00000000000L, 0x7fffffL
|
0x0L, 0xffffffe000000000L, 0xffffffffL, 0x0L
|
||||||
};
|
};
|
||||||
static final long[] jjbitVec7 = {
|
static final long[] jjbitVec7 = {
|
||||||
0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffL, 0x0L
|
0x0L, 0x0L, 0xfffff00000000000L, 0x7fffffL
|
||||||
};
|
};
|
||||||
static final long[] jjbitVec8 = {
|
static final long[] jjbitVec8 = {
|
||||||
0xfffffffeL, 0x0L, 0x0L, 0x0L
|
0xffffffffffffffffL, 0xffffffffffffffffL, 0xfffffffffL, 0x0L
|
||||||
};
|
};
|
||||||
static final long[] jjbitVec9 = {
|
static final long[] jjbitVec9 = {
|
||||||
0x0L, 0x0L, 0x0L, 0xff7fffffff7fffffL
|
0xfffffffeL, 0x0L, 0x0L, 0x0L
|
||||||
};
|
};
|
||||||
static final long[] jjbitVec10 = {
|
static final long[] jjbitVec10 = {
|
||||||
0x1600L, 0x0L, 0x0L, 0x0L
|
0x0L, 0x0L, 0x0L, 0xff7fffffff7fffffL
|
||||||
};
|
};
|
||||||
static final long[] jjbitVec11 = {
|
static final long[] jjbitVec11 = {
|
||||||
0x0L, 0xffc000000000L, 0x0L, 0xffc000000000L
|
0x0L, 0x0L, 0xffffffff00000000L, 0x1fffffffL
|
||||||
};
|
};
|
||||||
static final long[] jjbitVec12 = {
|
static final long[] jjbitVec12 = {
|
||||||
0x0L, 0x3ff00000000L, 0x0L, 0x3ff000000000000L
|
0x1600L, 0x0L, 0x0L, 0x0L
|
||||||
};
|
};
|
||||||
static final long[] jjbitVec13 = {
|
static final long[] jjbitVec13 = {
|
||||||
0x0L, 0xffc000000000L, 0x0L, 0xff8000000000L
|
0x0L, 0xffc000000000L, 0x0L, 0xffc000000000L
|
||||||
};
|
};
|
||||||
static final long[] jjbitVec14 = {
|
static final long[] jjbitVec14 = {
|
||||||
0x0L, 0xffc000000000L, 0x0L, 0x0L
|
0x0L, 0x3ff00000000L, 0x0L, 0x3ff000000000000L
|
||||||
};
|
};
|
||||||
static final long[] jjbitVec15 = {
|
static final long[] jjbitVec15 = {
|
||||||
0x0L, 0x3ff0000L, 0x0L, 0x3ff0000L
|
0x0L, 0xffc000000000L, 0x0L, 0xff8000000000L
|
||||||
};
|
};
|
||||||
static final long[] jjbitVec16 = {
|
static final long[] jjbitVec16 = {
|
||||||
0x0L, 0x3ffL, 0x0L, 0x0L
|
0x0L, 0xffc000000000L, 0x0L, 0x0L
|
||||||
};
|
};
|
||||||
static final long[] jjbitVec17 = {
|
static final long[] jjbitVec17 = {
|
||||||
|
0x0L, 0x3ff0000L, 0x0L, 0x3ff0000L
|
||||||
|
};
|
||||||
|
static final long[] jjbitVec18 = {
|
||||||
|
0x0L, 0x3ffL, 0x0L, 0x0L
|
||||||
|
};
|
||||||
|
static final long[] jjbitVec19 = {
|
||||||
0xfffffffeL, 0x0L, 0xfffff00000000000L, 0x7fffffL
|
0xfffffffeL, 0x0L, 0xfffff00000000000L, 0x7fffffL
|
||||||
};
|
};
|
||||||
private final int jjMoveNfa_0(int startState, int curPos)
|
private final int jjMoveNfa_0(int startState, int curPos)
|
||||||
|
@ -1012,8 +1018,10 @@ private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, lo
|
||||||
return ((jjbitVec3[i2] & l2) != 0L);
|
return ((jjbitVec3[i2] & l2) != 0L);
|
||||||
case 51:
|
case 51:
|
||||||
return ((jjbitVec4[i2] & l2) != 0L);
|
return ((jjbitVec4[i2] & l2) != 0L);
|
||||||
case 61:
|
case 77:
|
||||||
return ((jjbitVec5[i2] & l2) != 0L);
|
return ((jjbitVec5[i2] & l2) != 0L);
|
||||||
|
case 255:
|
||||||
|
return ((jjbitVec6[i2] & l2) != 0L);
|
||||||
default :
|
default :
|
||||||
if ((jjbitVec0[i1] & l1) != 0L)
|
if ((jjbitVec0[i1] & l1) != 0L)
|
||||||
return true;
|
return true;
|
||||||
|
@ -1025,9 +1033,9 @@ private static final boolean jjCanMove_1(int hiByte, int i1, int i2, long l1, lo
|
||||||
switch(hiByte)
|
switch(hiByte)
|
||||||
{
|
{
|
||||||
case 215:
|
case 215:
|
||||||
return ((jjbitVec7[i2] & l2) != 0L);
|
return ((jjbitVec8[i2] & l2) != 0L);
|
||||||
default :
|
default :
|
||||||
if ((jjbitVec6[i1] & l1) != 0L)
|
if ((jjbitVec7[i1] & l1) != 0L)
|
||||||
return true;
|
return true;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -1037,9 +1045,11 @@ private static final boolean jjCanMove_2(int hiByte, int i1, int i2, long l1, lo
|
||||||
switch(hiByte)
|
switch(hiByte)
|
||||||
{
|
{
|
||||||
case 0:
|
case 0:
|
||||||
return ((jjbitVec9[i2] & l2) != 0L);
|
return ((jjbitVec10[i2] & l2) != 0L);
|
||||||
|
case 255:
|
||||||
|
return ((jjbitVec11[i2] & l2) != 0L);
|
||||||
default :
|
default :
|
||||||
if ((jjbitVec8[i1] & l1) != 0L)
|
if ((jjbitVec9[i1] & l1) != 0L)
|
||||||
return true;
|
return true;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -1049,18 +1059,18 @@ private static final boolean jjCanMove_3(int hiByte, int i1, int i2, long l1, lo
|
||||||
switch(hiByte)
|
switch(hiByte)
|
||||||
{
|
{
|
||||||
case 6:
|
case 6:
|
||||||
return ((jjbitVec12[i2] & l2) != 0L);
|
|
||||||
case 11:
|
|
||||||
return ((jjbitVec13[i2] & l2) != 0L);
|
|
||||||
case 13:
|
|
||||||
return ((jjbitVec14[i2] & l2) != 0L);
|
return ((jjbitVec14[i2] & l2) != 0L);
|
||||||
case 14:
|
case 11:
|
||||||
return ((jjbitVec15[i2] & l2) != 0L);
|
return ((jjbitVec15[i2] & l2) != 0L);
|
||||||
case 16:
|
case 13:
|
||||||
return ((jjbitVec16[i2] & l2) != 0L);
|
return ((jjbitVec16[i2] & l2) != 0L);
|
||||||
|
case 14:
|
||||||
|
return ((jjbitVec17[i2] & l2) != 0L);
|
||||||
|
case 16:
|
||||||
|
return ((jjbitVec18[i2] & l2) != 0L);
|
||||||
default :
|
default :
|
||||||
if ((jjbitVec10[i1] & l1) != 0L)
|
if ((jjbitVec12[i1] & l1) != 0L)
|
||||||
if ((jjbitVec11[i2] & l2) == 0L)
|
if ((jjbitVec13[i2] & l2) == 0L)
|
||||||
return false;
|
return false;
|
||||||
else
|
else
|
||||||
return true;
|
return true;
|
||||||
|
@ -1072,11 +1082,13 @@ private static final boolean jjCanMove_4(int hiByte, int i1, int i2, long l1, lo
|
||||||
switch(hiByte)
|
switch(hiByte)
|
||||||
{
|
{
|
||||||
case 0:
|
case 0:
|
||||||
return ((jjbitVec9[i2] & l2) != 0L);
|
return ((jjbitVec10[i2] & l2) != 0L);
|
||||||
case 215:
|
case 215:
|
||||||
return ((jjbitVec7[i2] & l2) != 0L);
|
return ((jjbitVec8[i2] & l2) != 0L);
|
||||||
|
case 255:
|
||||||
|
return ((jjbitVec11[i2] & l2) != 0L);
|
||||||
default :
|
default :
|
||||||
if ((jjbitVec17[i1] & l1) != 0L)
|
if ((jjbitVec19[i1] & l1) != 0L)
|
||||||
return true;
|
return true;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue