From a9691a40e83237b447a40ab09018ab0f8ae64f12 Mon Sep 17 00:00:00 2001
From: Otis Gospodnetic
Date: Sun, 13 Aug 2006 07:47:34 +0000
Subject: [PATCH] - Modified some CJK Unicode code point ranges in
 StandardTokenizer.jj, and added a few more of them to increase CJK
 character coverage. Also documented some of the ranges.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@431152 13f79535-47bb-0310-9956-ffa450edef68
---
 CHANGES.txt                                     |  7 ++++++-
 .../analysis/standard/StandardTokenizer.jj     | 21 ++++++++++++++++---
 .../StandardTokenizerTokenManager.java         |  6 +++---
 3 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 1f92db10349..e40186a1e23 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -15,7 +15,12 @@ Changes in runtime behavior
 
  2. LUCENE-478: Updated the list of Unicode code point ranges for CJK
     (now split into CJ and K) in StandardAnalyzer.
-    (John Want and Steven Rowe via Otis Gospodnetic)
+    (John Wang and Steven Rowe via Otis Gospodnetic)
+
+ 3. Modified some CJK Unicode code point ranges in StandardTokenizer.jj,
+    and added a few more of them to increase CJK character coverage.
+    Also documented some of the ranges.
+    (Otis Gospodnetic)
 
 New features
 
diff --git a/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj b/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj
index cdd5f2b1a3c..415517d290d 100644
--- a/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj
+++ b/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj
@@ -110,17 +110,32 @@ TOKEN : {                                        // token patterns
 | < CJ:                                          // Chinese, Japanese
       [
        "\u3040"-"\u318f",
-       "\u31f0"-"\u31ff",
+       "\u3100"-"\u312f",    // BaPoMoFo (aka ZhuYin)
+       "\u3040"-"\u309F",    // Japanese: Hiragana
+       "\u30A0"-"\u30FF",    // Japanese: Katakana
+       "\u31F0"-"\u31FF",    // Japanese: Katakana Phonetic Extensions
        "\u3300"-"\u337f",
-       "\u3400"-"\u4db5",
+       "\u3400"-"\u4dbf",    // CJK Unified Ideographs Ext. A
        "\u4e00"-"\u9fff",
        "\uf900"-"\ufaff",
        "\uff65"-"\uff9f"
+
+// Otis: consider adding these, too
+//
+// 2E80-2EFF: CJK Radicals Supplement
+// 2F00-2FDF: Kangxi Radicals
+// 3190-319F: Kanbun
+// 31C0-31EF: CJK Strokes
+// 4E00-9FBF: CJK Unified
+// F900-FAFF: CJK Compatibility Ideographs
+
       ]
   >
 | < KOREAN:                                      // Korean
       [
-       "\uac00"-"\ud7a3"
+       "\uac00"-"\ud7af",     // Hangul Syllables
+       "\u1100"-"\u11ff"      // Hangul Jamo
+       // "\uac00"-"\ud7a3"
       ]
   >
 | < #DIGIT:                                      // unicode digits
diff --git a/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java b/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java
index 51ab2a4ea5c..cececb1b96e 100644
--- a/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java
+++ b/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java
@@ -53,16 +53,16 @@ static final long[] jjbitVec4 = {
    0xffffffffffffffffL, 0xffffffffffffffffL, 0x0L, 0x0L
 };
 static final long[] jjbitVec5 = {
-   0xffffffffffffffffL, 0xffffffffffffffffL, 0x3fffffffffffffL, 0x0L
+   0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL, 0x0L
 };
 static final long[] jjbitVec6 = {
    0x0L, 0xffffffe000000000L, 0xffffffffL, 0x0L
 };
 static final long[] jjbitVec7 = {
-   0x0L, 0x0L, 0xfffff00000000000L, 0x7fffffL
+   0x20000L, 0x0L, 0xfffff00000000000L, 0x7fffffL
 };
 static final long[] jjbitVec8 = {
-   0xffffffffffffffffL, 0xffffffffffffffffL, 0xfffffffffL, 0x0L
+   0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffL, 0x0L
 };
 static final long[] jjbitVec9 = {
    0xfffffffeL, 0x0L, 0x0L, 0x0L
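
Not part of the patch: a minimal standalone Java sketch of the CJ and KOREAN
character classes as they stand after this change, restated as plain range
checks so the new coverage is easy to sanity-check. The class name
CjkRangeCheck and the sample characters are made up for illustration; the
real StandardTokenizer matches these classes through the JavaCC-generated
bit vectors (jjbitVec*), not through code like this.

public class CjkRangeCheck {

  // CJ ranges from StandardTokenizer.jj after this patch.
  static boolean isCj(char c) {
    return (c >= 0x3040 && c <= 0x318f)    // pre-existing: Hiragana through Hangul Compatibility Jamo
        || (c >= 0x3100 && c <= 0x312f)    // BaPoMoFo (ZhuYin)
        || (c >= 0x30a0 && c <= 0x30ff)    // Katakana
        || (c >= 0x31f0 && c <= 0x31ff)    // Katakana Phonetic Extensions
        || (c >= 0x3300 && c <= 0x337f)    // CJK Compatibility
        || (c >= 0x3400 && c <= 0x4dbf)    // CJK Unified Ideographs Ext. A
        || (c >= 0x4e00 && c <= 0x9fff)    // CJK Unified Ideographs
        || (c >= 0xf900 && c <= 0xfaff)    // CJK Compatibility Ideographs
        || (c >= 0xff65 && c <= 0xff9f);   // Halfwidth Katakana forms
  }

  // KOREAN ranges from StandardTokenizer.jj after this patch.
  static boolean isKorean(char c) {
    return (c >= 0xac00 && c <= 0xd7af)    // Hangul Syllables
        || (c >= 0x1100 && c <= 0x11ff);   // Hangul Jamo (new in this patch)
  }

  public static void main(String[] args) {
    // Hypothetical sample characters: Hiragana, Katakana, a CJK ideograph,
    // a Hangul syllable, a Hangul Jamo, and a plain ASCII letter.
    char[] samples = { '\u306F', '\u30AB', '\u4E2D', '\uAC00', '\u1100', 'a' };
    for (char c : samples) {
      System.out.printf("U+%04X  CJ=%b  KOREAN=%b%n", (int) c, isCj(c), isKorean(c));
    }
  }
}

For example, '\u1100' (Hangul Jamo) now falls inside KOREAN, which it did not
under the previous single "\uac00"-"\ud7a3" range.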