mirror of https://github.com/apache/lucene.git
- Modified some CJK Unicode code point ranges in StandardTokenizer.jj,
and added a few more of them to increase CJK character coverage. Also documented some of the ranges.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@431152 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5705f8dfd2
commit
a9691a40e8
|
@ -15,7 +15,12 @@ Changes in runtime behavior
|
||||||
|
|
||||||
2. LUCENE-478: Updated the list of Unicode code point ranges for CJK (now
|
2. LUCENE-478: Updated the list of Unicode code point ranges for CJK (now
|
||||||
split into CJ and K) in StandardAnalyzer.
|
split into CJ and K) in StandardAnalyzer.
|
||||||
(John Want and Steven Rowe via Otis Gospodnetic)
|
(John Wang and Steven Rowe via Otis Gospodnetic)
|
||||||
|
|
||||||
|
3. Modified some CJK Unicode code point ranges in StandardTokenizer.jj,
|
||||||
|
and added a few more of them to increase CJK character coverage.
|
||||||
|
Also documented some of the ranges.
|
||||||
|
(Otis Gospodnetic)
|
||||||
|
|
||||||
New features
|
New features
|
||||||
|
|
||||||
|
|
|
@ -110,17 +110,32 @@ TOKEN : { // token patterns
|
||||||
| < CJ: // Chinese, Japanese
|
| < CJ: // Chinese, Japanese
|
||||||
[
|
[
|
||||||
"\u3040"-"\u318f",
|
"\u3040"-"\u318f",
|
||||||
"\u31f0"-"\u31ff",
|
"\u3100"-"\u312f", // BaPoMoFo (aka ZhuYin)
|
||||||
|
"\u3040"-"\u309F", // Japanese: Hiragana
|
||||||
|
"\u30A0"-"\u30FF", // Japanese: Katakana
|
||||||
|
"\u31F0"-"\u31FF", // Japanese: Katakana Phonetic Extensions
|
||||||
"\u3300"-"\u337f",
|
"\u3300"-"\u337f",
|
||||||
"\u3400"-"\u4db5",
|
"\u3400"-"\u4dbf", // CJK Unified Ideographs Ext. A
|
||||||
"\u4e00"-"\u9fff",
|
"\u4e00"-"\u9fff",
|
||||||
"\uf900"-"\ufaff",
|
"\uf900"-"\ufaff",
|
||||||
"\uff65"-"\uff9f"
|
"\uff65"-"\uff9f"
|
||||||
|
|
||||||
|
// Otis: consider adding these, too
|
||||||
|
//
|
||||||
|
// 2E80-2EFF: CJK Radicals Supplement
|
||||||
|
// 2F00-2FDF: Kangxi Radicals
|
||||||
|
// 3190-319F: Kanbun
|
||||||
|
// 31C0-31EF: CJK Strokes
|
||||||
|
// 4E00-9FBF: CJK Unified Ideographs
|
||||||
|
// F900-FAFF: CJK Compatibility Ideographs
|
||||||
|
|
||||||
]
|
]
|
||||||
>
|
>
|
||||||
| < KOREAN: // Korean
|
| < KOREAN: // Korean
|
||||||
[
|
[
|
||||||
"\uac00"-"\ud7a3"
|
"\uac00"-"\ud7af", // Hangul Syllables
|
||||||
|
"\u1100"-"\u11ff" // Hangul Jamo
|
||||||
|
// "\uac00"-"\ud7a3"
|
||||||
]
|
]
|
||||||
>
|
>
|
||||||
| < #DIGIT: // unicode digits
|
| < #DIGIT: // unicode digits
|
||||||
|
|
|
@ -53,16 +53,16 @@ static final long[] jjbitVec4 = {
|
||||||
0xffffffffffffffffL, 0xffffffffffffffffL, 0x0L, 0x0L
|
0xffffffffffffffffL, 0xffffffffffffffffL, 0x0L, 0x0L
|
||||||
};
|
};
|
||||||
static final long[] jjbitVec5 = {
|
static final long[] jjbitVec5 = {
|
||||||
0xffffffffffffffffL, 0xffffffffffffffffL, 0x3fffffffffffffL, 0x0L
|
0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL, 0x0L
|
||||||
};
|
};
|
||||||
static final long[] jjbitVec6 = {
|
static final long[] jjbitVec6 = {
|
||||||
0x0L, 0xffffffe000000000L, 0xffffffffL, 0x0L
|
0x0L, 0xffffffe000000000L, 0xffffffffL, 0x0L
|
||||||
};
|
};
|
||||||
static final long[] jjbitVec7 = {
|
static final long[] jjbitVec7 = {
|
||||||
0x0L, 0x0L, 0xfffff00000000000L, 0x7fffffL
|
0x20000L, 0x0L, 0xfffff00000000000L, 0x7fffffL
|
||||||
};
|
};
|
||||||
static final long[] jjbitVec8 = {
|
static final long[] jjbitVec8 = {
|
||||||
0xffffffffffffffffL, 0xffffffffffffffffL, 0xfffffffffL, 0x0L
|
0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffL, 0x0L
|
||||||
};
|
};
|
||||||
static final long[] jjbitVec9 = {
|
static final long[] jjbitVec9 = {
|
||||||
0xfffffffeL, 0x0L, 0x0L, 0x0L
|
0xfffffffeL, 0x0L, 0x0L, 0x0L
|
||||||
|
|
Loading…
Reference in New Issue