diff --git a/CHANGES.txt b/CHANGES.txt index 88b63b68af8..1f92db10349 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -12,7 +12,11 @@ Changes in runtime behavior Note that this problem still exists for 'a', e.g. in 'a-class' as 'a' continues to be a stopword. (Daniel Naber) - + + 2. LUCENE-478: Updated the list of Unicode code point ranges for CJK (now + split into CJ and K) in StandardAnalyzer. + (John Want and Steven Rowe via Otis Gospodnetic) + New features 1. LUCENE-503: New ThaiAnalyzer and ThaiWordFilter in contrib/analyzers diff --git a/src/demo/org/apache/lucene/demo/html/HTMLParser.java b/src/demo/org/apache/lucene/demo/html/HTMLParser.java index d3641860629..2eb3983d973 100644 --- a/src/demo/org/apache/lucene/demo/html/HTMLParser.java +++ b/src/demo/org/apache/lucene/demo/html/HTMLParser.java @@ -40,6 +40,12 @@ public class HTMLParser implements HTMLParserConstants { } } + /** + * @deprecated Use HTMLParser(FileInputStream) instead + */ + public HTMLParser(File file) throws FileNotFoundException { + this(new FileInputStream(file)); + } public String getTitle() throws IOException, InterruptedException { if (pipeIn == null) diff --git a/src/java/org/apache/lucene/analysis/standard/CharStream.java b/src/java/org/apache/lucene/analysis/standard/CharStream.java index 56065c5d7f1..9b52d8d89b7 100644 --- a/src/java/org/apache/lucene/analysis/standard/CharStream.java +++ b/src/java/org/apache/lucene/analysis/standard/CharStream.java @@ -25,6 +25,20 @@ public interface CharStream { */ char readChar() throws java.io.IOException; + /** + * Returns the column position of the character last read. + * @deprecated + * @see #getEndColumn + */ + int getColumn(); + + /** + * Returns the line number of the character last read. + * @deprecated + * @see #getEndLine + */ + int getLine(); + /** * Returns the column number of the last character for current token (being * matched after the last call to BeginTOken). diff --git a/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj b/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj index 8409ba3f0a3..cdd5f2b1a3c 100644 --- a/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj +++ b/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj @@ -103,21 +103,24 @@ TOKEN : { // token patterns "\u00c0"-"\u00d6", "\u00d8"-"\u00f6", "\u00f8"-"\u00ff", - "\u0100"-"\u1fff" + "\u0100"-"\u1fff", + "\uffa0"-"\uffdc" ] > | < CJ: // Chinese, Japanese [ "\u3040"-"\u318f", + "\u31f0"-"\u31ff", "\u3300"-"\u337f", - "\u3400"-"\u3d2d", + "\u3400"-"\u4db5", "\u4e00"-"\u9fff", - "\uf900"-"\ufaff" + "\uf900"-"\ufaff", + "\uff65"-"\uff9f" ] > | < KOREAN: // Korean [ - "\uac00"-"\ud7af" + "\uac00"-"\ud7a3" ] > | < #DIGIT: // unicode digits diff --git a/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java b/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java index d8d6ca160de..51ab2a4ea5c 100644 --- a/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java +++ b/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java @@ -41,54 +41,60 @@ private final void jjCheckNAddStates(int start) jjCheckNAdd(jjnextStates[start + 1]); } static final long[] jjbitVec0 = { - 0x1ff0000000000000L, 0xffffffffffffc000L, 0xffffffffL, 0x600000000000000L + 0xfff0000000000000L, 0xffffffffffffdfffL, 0xffffffffL, 0x600000000000000L }; static final long[] jjbitVec2 = { 0x0L, 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL }; static final long[] jjbitVec3 = { - 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffL, 0x0L + 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffL, 0xffff000000000000L }; static final long[] jjbitVec4 = { 0xffffffffffffffffL, 0xffffffffffffffffL, 0x0L, 0x0L }; static final long[] jjbitVec5 = { - 0x3fffffffffffL, 0x0L, 0x0L, 0x0L + 0xffffffffffffffffL, 0xffffffffffffffffL, 0x3fffffffffffffL, 0x0L }; static final long[] jjbitVec6 = { - 0x0L, 0x0L, 0xfffff00000000000L, 0x7fffffL + 0x0L, 0xffffffe000000000L, 0xffffffffL, 0x0L }; static final long[] jjbitVec7 = { - 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffL, 0x0L + 0x0L, 0x0L, 0xfffff00000000000L, 0x7fffffL }; static final long[] jjbitVec8 = { - 0xfffffffeL, 0x0L, 0x0L, 0x0L + 0xffffffffffffffffL, 0xffffffffffffffffL, 0xfffffffffL, 0x0L }; static final long[] jjbitVec9 = { - 0x0L, 0x0L, 0x0L, 0xff7fffffff7fffffL + 0xfffffffeL, 0x0L, 0x0L, 0x0L }; static final long[] jjbitVec10 = { - 0x1600L, 0x0L, 0x0L, 0x0L + 0x0L, 0x0L, 0x0L, 0xff7fffffff7fffffL }; static final long[] jjbitVec11 = { - 0x0L, 0xffc000000000L, 0x0L, 0xffc000000000L + 0x0L, 0x0L, 0xffffffff00000000L, 0x1fffffffL }; static final long[] jjbitVec12 = { - 0x0L, 0x3ff00000000L, 0x0L, 0x3ff000000000000L + 0x1600L, 0x0L, 0x0L, 0x0L }; static final long[] jjbitVec13 = { - 0x0L, 0xffc000000000L, 0x0L, 0xff8000000000L + 0x0L, 0xffc000000000L, 0x0L, 0xffc000000000L }; static final long[] jjbitVec14 = { - 0x0L, 0xffc000000000L, 0x0L, 0x0L + 0x0L, 0x3ff00000000L, 0x0L, 0x3ff000000000000L }; static final long[] jjbitVec15 = { - 0x0L, 0x3ff0000L, 0x0L, 0x3ff0000L + 0x0L, 0xffc000000000L, 0x0L, 0xff8000000000L }; static final long[] jjbitVec16 = { - 0x0L, 0x3ffL, 0x0L, 0x0L + 0x0L, 0xffc000000000L, 0x0L, 0x0L }; static final long[] jjbitVec17 = { + 0x0L, 0x3ff0000L, 0x0L, 0x3ff0000L +}; +static final long[] jjbitVec18 = { + 0x0L, 0x3ffL, 0x0L, 0x0L +}; +static final long[] jjbitVec19 = { 0xfffffffeL, 0x0L, 0xfffff00000000000L, 0x7fffffL }; private final int jjMoveNfa_0(int startState, int curPos) @@ -1012,8 +1018,10 @@ private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, lo return ((jjbitVec3[i2] & l2) != 0L); case 51: return ((jjbitVec4[i2] & l2) != 0L); - case 61: + case 77: return ((jjbitVec5[i2] & l2) != 0L); + case 255: + return ((jjbitVec6[i2] & l2) != 0L); default : if ((jjbitVec0[i1] & l1) != 0L) return true; @@ -1025,9 +1033,9 @@ private static final boolean jjCanMove_1(int hiByte, int i1, int i2, long l1, lo switch(hiByte) { case 215: - return ((jjbitVec7[i2] & l2) != 0L); + return ((jjbitVec8[i2] & l2) != 0L); default : - if ((jjbitVec6[i1] & l1) != 0L) + if ((jjbitVec7[i1] & l1) != 0L) return true; return false; } @@ -1037,9 +1045,11 @@ private static final boolean jjCanMove_2(int hiByte, int i1, int i2, long l1, lo switch(hiByte) { case 0: - return ((jjbitVec9[i2] & l2) != 0L); + return ((jjbitVec10[i2] & l2) != 0L); + case 255: + return ((jjbitVec11[i2] & l2) != 0L); default : - if ((jjbitVec8[i1] & l1) != 0L) + if ((jjbitVec9[i1] & l1) != 0L) return true; return false; } @@ -1049,18 +1059,18 @@ private static final boolean jjCanMove_3(int hiByte, int i1, int i2, long l1, lo switch(hiByte) { case 6: - return ((jjbitVec12[i2] & l2) != 0L); - case 11: - return ((jjbitVec13[i2] & l2) != 0L); - case 13: return ((jjbitVec14[i2] & l2) != 0L); - case 14: + case 11: return ((jjbitVec15[i2] & l2) != 0L); - case 16: + case 13: return ((jjbitVec16[i2] & l2) != 0L); + case 14: + return ((jjbitVec17[i2] & l2) != 0L); + case 16: + return ((jjbitVec18[i2] & l2) != 0L); default : - if ((jjbitVec10[i1] & l1) != 0L) - if ((jjbitVec11[i2] & l2) == 0L) + if ((jjbitVec12[i1] & l1) != 0L) + if ((jjbitVec13[i2] & l2) == 0L) return false; else return true; @@ -1072,11 +1082,13 @@ private static final boolean jjCanMove_4(int hiByte, int i1, int i2, long l1, lo switch(hiByte) { case 0: - return ((jjbitVec9[i2] & l2) != 0L); + return ((jjbitVec10[i2] & l2) != 0L); case 215: - return ((jjbitVec7[i2] & l2) != 0L); + return ((jjbitVec8[i2] & l2) != 0L); + case 255: + return ((jjbitVec11[i2] & l2) != 0L); default : - if ((jjbitVec17[i1] & l1) != 0L) + if ((jjbitVec19[i1] & l1) != 0L) return true; return false; }