mirror of https://github.com/apache/lucene.git
- Modified some CJK Unicode code point ranges in StandardTokenizer.jj,
and added a few more of them to increase CJK character coverage. Also documented some of the ranges. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@431152 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5705f8dfd2
commit
a9691a40e8
|
@ -15,7 +15,12 @@ Changes in runtime behavior
|
|||
|
||||
2. LUCENE-478: Updated the list of Unicode code point ranges for CJK (now
|
||||
split into CJ and K) in StandardAnalyzer.
|
||||
(John Want and Steven Rowe via Otis Gospodnetic)
|
||||
(John Wang and Steven Rowe via Otis Gospodnetic)
|
||||
|
||||
3. Modified some CJK Unicode code point ranges in StandardTokenizer.jj,
|
||||
and added a few more of them to increase CJK character coverage.
|
||||
Also documented some of the ranges.
|
||||
(Otis Gospodnetic)
|
||||
|
||||
New features
|
||||
|
||||
|
|
|
@ -110,17 +110,32 @@ TOKEN : { // token patterns
|
|||
| < CJ: // Chinese, Japanese
|
||||
[
|
||||
"\u3040"-"\u318f",
|
||||
"\u31f0"-"\u31ff",
|
||||
"\u3100"-"\u312f", // BaPoMoFo (aka ZhuYin)
|
||||
"\u3040"-"\u309F", // Japanese: Hiragana
|
||||
"\u30A0"-"\u30FF", // Japanese: Katakana
|
||||
"\u31F0"-"\u31FF", // Japanese: Katakana Phonetic Extensions
|
||||
"\u3300"-"\u337f",
|
||||
"\u3400"-"\u4db5",
|
||||
"\u3400"-"\u4dbf", // CJK Unified Ideographs Ext. A
|
||||
"\u4e00"-"\u9fff",
|
||||
"\uf900"-"\ufaff",
|
||||
"\uff65"-"\uff9f"
|
||||
|
||||
// Otis: consider adding these, too
|
||||
//
|
||||
// 2E80-2EFF: CJK Radicals Supplement
|
||||
// 2F00-2FDF: Kangxi Radicals
|
||||
// 3190-319F: Kanbun
|
||||
// 31C0-31EF: CJK Strokes
|
||||
// 4E00-9FBF: CJK Unified
|
||||
// F900-FAFF: CJK Compatibility Ideographs
|
||||
|
||||
]
|
||||
>
|
||||
| < KOREAN: // Korean
|
||||
[
|
||||
"\uac00"-"\ud7a3"
|
||||
"\uac00"-"\ud7af", // Hangul Syllables
|
||||
"\u1100"-"\u11ff" // Hangul Jamo
|
||||
// "\uac00"-"\ud7a3"
|
||||
]
|
||||
>
|
||||
| < #DIGIT: // unicode digits
|
||||
|
|
|
@ -53,16 +53,16 @@ static final long[] jjbitVec4 = {
|
|||
0xffffffffffffffffL, 0xffffffffffffffffL, 0x0L, 0x0L
|
||||
};
|
||||
static final long[] jjbitVec5 = {
|
||||
0xffffffffffffffffL, 0xffffffffffffffffL, 0x3fffffffffffffL, 0x0L
|
||||
0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL, 0x0L
|
||||
};
|
||||
static final long[] jjbitVec6 = {
|
||||
0x0L, 0xffffffe000000000L, 0xffffffffL, 0x0L
|
||||
};
|
||||
static final long[] jjbitVec7 = {
|
||||
0x0L, 0x0L, 0xfffff00000000000L, 0x7fffffL
|
||||
0x20000L, 0x0L, 0xfffff00000000000L, 0x7fffffL
|
||||
};
|
||||
static final long[] jjbitVec8 = {
|
||||
0xffffffffffffffffL, 0xffffffffffffffffL, 0xfffffffffL, 0x0L
|
||||
0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffL, 0x0L
|
||||
};
|
||||
static final long[] jjbitVec9 = {
|
||||
0xfffffffeL, 0x0L, 0x0L, 0x0L
|
||||
|
|
Loading…
Reference in New Issue