diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 27aabff7227..813a0faaad5 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -535,6 +535,11 @@ Bug fixes suppressed exceptions in the original exception, so stack trace will contain them. (Uwe Schindler) +* LUCENE-3358: StandardTokenizer wrongly discarded combining marks attached + to Han or Hiragana characters, this is fixed if you supply Version >= 3.4 + If you supply a previous lucene version, you get the old buggy behavior + for backwards compatibility. (Trejkaz, Robert Muir) + New Features * LUCENE-3290: Added FieldInvertState.numUniqueTerms diff --git a/modules/analysis/common/build.xml b/modules/analysis/common/build.xml index b9c146633aa..d4c11108c98 100644 --- a/modules/analysis/common/build.xml +++ b/modules/analysis/common/build.xml @@ -67,6 +67,9 @@ + diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java index d0287208490..637e06994bd 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java @@ -39,6 +39,9 @@ import java.util.Set; *

You must specify the required {@link Version} * compatibility when creating StandardAnalyzer: *

    + *
  • As of 3.4, Hiragana and Han characters are no longer wrongly split + * from their combining characters. If you use a previous version number, + * you get the exact broken behavior for backwards compatibility. *
  • As of 3.1, StandardTokenizer implements Unicode text segmentation, * and StopFilter correctly handles Unicode 4.0 supplementary characters * in stopwords. {@link ClassicTokenizer} and {@link ClassicAnalyzer} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java index 4051cdfb254..5c0a98c0e89 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.io.Reader; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.standard.std31.StandardTokenizerImpl31; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; @@ -42,6 +43,9 @@ import org.apache.lucene.util.Version; *

    You must specify the required {@link Version} * compatibility when creating StandardTokenizer: *

      + *
    • As of 3.4, Hiragana and Han characters are no longer wrongly split + * from their combining characters. If you use a previous version number, + * you get the exact broken behavior for backwards compatibility. *
    • As of 3.1, StandardTokenizer implements Unicode text segmentation. * If you use a previous version number, you get the exact behavior of * {@link ClassicTokenizer} for backwards compatibility. @@ -142,8 +146,13 @@ public final class StandardTokenizer extends Tokenizer { } private final void init(Reader input, Version matchVersion) { - this.scanner = matchVersion.onOrAfter(Version.LUCENE_31) ? - new StandardTokenizerImpl(input) : new ClassicTokenizerImpl(input); + if (matchVersion.onOrAfter(Version.LUCENE_34)) { + this.scanner = new StandardTokenizerImpl(input); + } else if (matchVersion.onOrAfter(Version.LUCENE_31)) { + this.scanner = new StandardTokenizerImpl31(input); + } else { + this.scanner = new ClassicTokenizerImpl(input); + } this.input = input; } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java index 34e325cd122..2c2d46ac2b5 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java @@ -1,4 +1,4 @@ -/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 2/9/11 11:45 AM */ +/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/4/11 4:07 PM */ package org.apache.lucene.analysis.standard; @@ -209,10 +209,10 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface { private static final String ZZ_ACTION_PACKED_0 = "\1\0\23\1\1\2\1\3\1\4\1\1\1\5\1\6"+ "\1\7\1\10\15\0\1\2\1\0\1\2\10\0\1\3"+ - "\15\0\1\2\57\0"; + "\15\0\1\2\71\0"; private static int [] zzUnpackAction() { - int [] result = new int[114]; + int [] result = new int[124]; int offset = 0; offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); return result; @@ -240,21 +240,22 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface { "\0\0\0\147\0\316\0\u0135\0\u019c\0\u0203\0\u026a\0\u02d1"+ "\0\u0338\0\u039f\0\u0406\0\u046d\0\u04d4\0\u053b\0\u05a2\0\u0609"+ "\0\u0670\0\u06d7\0\u073e\0\u07a5\0\u080c\0\u0873\0\u08da\0\u0941"+ - "\0\u09a8\0\147\0\147\0\u0a0f\0\316\0\u0135\0\u019c\0\u0203"+ - "\0\u026a\0\u0a76\0\u0add\0\u0b44\0\u0bab\0\u046d\0\u0c12\0\u0c79"+ - "\0\u0ce0\0\u0d47\0\u0dae\0\u0e15\0\u0e7c\0\u0338\0\u039f\0\u0ee3"+ - "\0\u0f4a\0\u0fb1\0\u1018\0\u107f\0\u10e6\0\u114d\0\u11b4\0\u121b"+ - "\0\u1282\0\u12e9\0\u1350\0\u13b7\0\u141e\0\u1485\0\u14ec\0\u1553"+ - "\0\u15ba\0\u0941\0\u1621\0\u1688\0\u16ef\0\u1756\0\u17bd\0\u1824"+ - "\0\u188b\0\u18f2\0\u1959\0\u19c0\0\u1a27\0\u1a8e\0\u1af5\0\u1b5c"+ - "\0\u1bc3\0\u1c2a\0\u1c91\0\u1cf8\0\u1d5f\0\u1dc6\0\u1e2d\0\u1e94"+ - "\0\u1efb\0\u1f62\0\u1fc9\0\u2030\0\u2097\0\u20fe\0\u2165\0\u21cc"+ - "\0\u2233\0\u229a\0\u2301\0\u2368\0\u23cf\0\u2436\0\u249d\0\u2504"+ - "\0\u256b\0\u25d2\0\u2639\0\u26a0\0\u2707\0\u276e\0\u27d5\0\u283c"+ - "\0\u28a3\0\u290a"; + "\0\u09a8\0\u0a0f\0\u0a76\0\u0add\0\316\0\u0135\0\u019c\0\u0203"+ + "\0\u026a\0\u0b44\0\u0bab\0\u0c12\0\u0c79\0\u046d\0\u0ce0\0\u0d47"+ + "\0\u0dae\0\u0e15\0\u0e7c\0\u0ee3\0\u0f4a\0\u0338\0\u039f\0\u0fb1"+ + "\0\u1018\0\u107f\0\u10e6\0\u114d\0\u11b4\0\u121b\0\u1282\0\u12e9"+ + "\0\u1350\0\u13b7\0\u141e\0\u1485\0\u14ec\0\u1553\0\u15ba\0\u1621"+ + "\0\u1688\0\u0941\0\u16ef\0\u1756\0\u17bd\0\u1824\0\u188b\0\u18f2"+ + "\0\u1959\0\u19c0\0\u1a27\0\u1a8e\0\u1af5\0\u1b5c\0\u1bc3\0\u1c2a"+ + "\0\u1c91\0\u1cf8\0\u1d5f\0\u1dc6\0\u1e2d\0\u1e94\0\u1efb\0\u1f62"+ + "\0\u1fc9\0\u2030\0\u2097\0\u20fe\0\u2165\0\u21cc\0\u2233\0\u229a"+ + "\0\u2301\0\u2368\0\u23cf\0\u2436\0\u249d\0\u2504\0\u256b\0\u25d2"+ + "\0\u2639\0\u26a0\0\u2707\0\u276e\0\u27d5\0\u283c\0\u28a3\0\u290a"+ + "\0\u2971\0\u29d8\0\u2a3f\0\u2aa6\0\u2b0d\0\u2b74\0\u2bdb\0\u2c42"+ + "\0\u2ca9\0\u2d10\0\u2d77\0\u2dde"; private static int [] zzUnpackRowMap() { - int [] result = new int[114]; + int [] result = new int[124]; int offset = 0; offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); return result; @@ -367,223 +368,241 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface { "\1\57\3\0\1\75\11\0\1\46\2\0\1\76\16\0"+ "\1\77\2\0\1\100\21\0\1\101\17\0\1\25\1\102"+ "\1\26\1\103\3\0\1\102\1\0\1\102\2\0\1\25"+ - "\142\0\2\31\4\0\1\35\1\0\1\36\1\0\1\37"+ - "\1\0\1\40\1\0\1\41\1\0\1\104\3\0\1\43"+ - "\5\0\1\44\3\0\1\105\11\0\1\46\2\0\1\106"+ - "\16\0\1\107\2\0\1\110\41\0\1\25\1\34\1\52"+ - "\1\0\1\53\1\0\1\53\1\54\1\0\1\34\2\0"+ - "\1\34\2\0\1\25\11\0\3\25\5\0\1\25\1\0"+ - "\1\25\1\0\1\25\4\0\1\25\4\0\1\25\1\0"+ - "\2\25\4\0\1\25\5\0\1\25\3\0\1\25\4\0"+ - "\5\25\10\0\1\52\1\0\2\25\1\0\1\25\10\0"+ - "\1\25\24\0\1\25\1\0\1\52\7\0\2\25\2\0"+ - "\5\25\2\0\2\25\4\0\6\25\1\0\2\25\4\0"+ - "\5\25\1\0\5\25\1\0\2\25\1\0\3\25\1\0"+ - "\4\25\1\0\5\25\1\52\1\0\1\25\1\0\1\25"+ - "\1\0\3\25\2\0\1\25\1\0\1\25\1\0\1\25"+ - "\2\0\1\25\17\0\1\25\3\0\1\25\5\0\2\25"+ - "\3\0\1\25\4\0\3\25\4\0\1\25\1\0\1\25"+ - "\2\0\1\25\1\0\2\25\4\0\1\25\1\0\1\25"+ - "\3\0\2\25\1\0\1\25\5\0\3\25\1\0\1\25"+ - "\10\0\1\25\1\0\2\52\1\0\1\25\10\0\1\25"+ - "\24\0\1\25\3\0\1\25\6\0\2\25\5\0\1\25"+ - "\1\0\1\25\1\0\1\25\1\0\11\25\2\0\1\25"+ - "\4\0\1\25\4\0\6\25\2\0\1\25\1\0\1\25"+ - "\1\0\3\25\1\0\1\25\1\0\2\25\4\0\3\25"+ - "\1\0\1\25\10\0\1\25\1\0\2\25\21\0\1\25"+ - "\3\0\1\25\5\0\1\25\32\0\15\25\5\0\3\25"+ - "\1\0\1\25\5\0\3\25\5\0\1\25\2\0\2\25"+ - "\4\0\1\25\2\0\1\25\1\0\1\25\103\0\2\25"+ - "\6\0\1\25\56\0\1\25\3\0\1\25\2\0\1\25"+ - "\3\0\1\25\5\0\1\25\7\0\1\25\4\0\2\25"+ - "\3\0\2\25\1\0\1\25\4\0\1\25\1\0\1\25"+ - "\2\0\2\25\1\0\3\25\1\0\1\25\2\0\4\25"+ - "\2\0\1\25\41\0\1\35\1\0\1\36\1\0\1\37"+ - "\1\0\1\40\1\0\1\41\1\0\1\111\3\0\1\43"+ - "\5\0\1\44\3\0\1\112\11\0\1\46\2\0\1\113"+ - "\16\0\1\114\2\0\1\115\41\0\1\25\2\52\2\0"+ - "\2\116\1\54\1\0\1\52\2\0\1\25\1\0\1\35"+ + "\142\0\2\31\16\0\1\104\15\0\1\105\14\0\1\106"+ + "\16\0\1\107\2\0\1\110\42\0\1\32\7\0\1\32"+ + "\16\0\1\111\15\0\1\112\14\0\1\113\16\0\1\114"+ + "\2\0\1\115\42\0\1\33\7\0\1\33\4\0\1\35"+ "\1\0\1\36\1\0\1\37\1\0\1\40\1\0\1\41"+ - "\1\0\1\117\3\0\1\120\5\0\1\121\3\0\1\122"+ - "\11\0\1\46\2\0\1\123\16\0\1\124\2\0\1\125"+ - "\41\0\1\25\1\53\7\0\1\53\2\0\1\25\1\0"+ - "\1\35\1\0\1\36\1\0\1\37\1\0\1\40\1\0"+ - "\1\41\1\0\1\126\3\0\1\43\5\0\1\44\3\0"+ - "\1\127\11\0\1\46\2\0\1\130\16\0\1\131\2\0"+ - "\1\132\21\0\1\101\17\0\1\25\1\54\1\52\1\103"+ - "\3\0\1\54\1\0\1\54\2\0\1\25\2\0\1\26"+ - "\11\0\3\25\5\0\1\25\1\0\1\25\1\0\1\25"+ - "\4\0\1\25\4\0\1\26\1\0\2\26\4\0\1\25"+ - "\5\0\1\25\3\0\1\26\4\0\1\26\2\25\2\26"+ - "\10\0\1\26\1\0\2\25\1\0\1\26\10\0\1\25"+ - "\24\0\1\25\3\0\1\25\6\0\2\25\5\0\1\25"+ - "\1\0\1\25\1\0\1\25\1\0\11\25\2\0\1\25"+ - "\4\0\1\25\4\0\6\25\2\0\1\25\1\0\1\25"+ - "\1\0\3\25\1\0\1\26\1\0\2\25\4\0\3\25"+ - "\1\0\1\25\10\0\1\25\1\0\2\25\21\0\1\25"+ - "\3\0\1\25\5\0\1\25\32\0\15\25\5\0\3\25"+ - "\1\0\1\25\5\0\1\25\2\26\5\0\1\25\2\0"+ - "\1\25\1\26\4\0\1\25\2\0\1\25\1\0\1\25"+ - "\103\0\2\26\6\0\1\26\56\0\1\26\3\0\1\26"+ - "\2\0\1\26\3\0\1\26\5\0\1\26\7\0\1\26"+ - "\4\0\2\26\3\0\2\26\1\0\1\26\4\0\1\26"+ - "\1\0\1\26\2\0\2\26\1\0\3\26\1\0\1\26"+ - "\2\0\4\26\2\0\1\26\53\0\1\133\3\0\1\134"+ - "\5\0\1\135\3\0\1\136\14\0\1\137\16\0\1\140"+ - "\2\0\1\141\42\0\1\64\1\26\6\0\1\64\4\0"+ - "\1\35\1\0\1\36\1\0\1\37\1\0\1\40\1\0"+ - "\1\41\1\0\1\142\3\0\1\56\5\0\1\57\3\0"+ - "\1\143\11\0\1\46\2\0\1\144\16\0\1\145\2\0"+ - "\1\146\21\0\1\101\17\0\1\25\1\65\1\26\1\103"+ - "\3\0\1\65\1\0\1\65\2\0\1\25\2\0\1\27"+ - "\37\0\1\27\1\0\2\27\16\0\1\27\4\0\1\27"+ - "\2\0\2\27\15\0\1\27\132\0\1\27\153\0\2\27"+ - "\11\0\1\27\115\0\2\27\6\0\1\27\56\0\1\27"+ - "\3\0\1\27\2\0\1\27\3\0\1\27\5\0\1\27"+ - "\7\0\1\27\4\0\2\27\3\0\2\27\1\0\1\27"+ - "\4\0\1\27\1\0\1\27\2\0\2\27\1\0\3\27"+ - "\1\0\1\27\2\0\4\27\2\0\1\27\153\0\1\27"+ - "\35\0\1\102\11\0\3\25\5\0\1\25\1\0\1\25"+ - "\1\0\1\25\4\0\1\25\4\0\1\102\1\0\2\102"+ - "\4\0\1\25\5\0\1\25\3\0\1\102\4\0\1\102"+ - "\2\25\2\102\10\0\1\26\1\0\2\25\1\0\1\102"+ - "\10\0\1\25\24\0\1\25\3\0\1\25\6\0\2\25"+ - "\5\0\1\25\1\0\1\25\1\0\1\25\1\0\11\25"+ - "\2\0\1\25\4\0\1\25\4\0\6\25\2\0\1\25"+ - "\1\0\1\25\1\0\3\25\1\0\1\102\1\0\2\25"+ - "\4\0\3\25\1\0\1\25\10\0\1\25\1\0\2\25"+ - "\21\0\1\25\3\0\1\25\5\0\1\25\32\0\15\25"+ - "\5\0\3\25\1\0\1\25\5\0\1\25\2\102\5\0"+ - "\1\25\2\0\1\25\1\102\4\0\1\25\2\0\1\25"+ - "\1\0\1\25\103\0\2\102\6\0\1\102\56\0\1\102"+ - "\3\0\1\102\2\0\1\102\3\0\1\102\5\0\1\102"+ - "\7\0\1\102\4\0\2\102\3\0\2\102\1\0\1\102"+ - "\4\0\1\102\1\0\1\102\2\0\2\102\1\0\3\102"+ - "\1\0\1\102\2\0\4\102\2\0\1\102\153\0\1\103"+ - "\46\0\1\147\15\0\1\150\14\0\1\151\16\0\1\152"+ - "\2\0\1\153\21\0\1\101\20\0\1\103\1\0\1\103"+ - "\3\0\1\54\1\0\1\103\5\0\1\34\11\0\3\25"+ - "\5\0\1\25\1\0\1\25\1\0\1\25\4\0\1\25"+ - "\4\0\1\34\1\0\2\34\4\0\1\25\5\0\1\25"+ - "\3\0\1\34\4\0\1\34\2\25\2\34\10\0\1\52"+ - "\1\0\2\25\1\0\1\34\10\0\1\25\24\0\1\25"+ - "\3\0\1\25\6\0\2\25\5\0\1\25\1\0\1\25"+ - "\1\0\1\25\1\0\11\25\2\0\1\25\4\0\1\25"+ - "\4\0\6\25\2\0\1\25\1\0\1\25\1\0\3\25"+ - "\1\0\1\34\1\0\2\25\4\0\3\25\1\0\1\25"+ - "\10\0\1\25\1\0\2\25\21\0\1\25\3\0\1\25"+ - "\5\0\1\25\32\0\15\25\5\0\3\25\1\0\1\25"+ - "\5\0\1\25\2\34\5\0\1\25\2\0\1\25\1\34"+ - "\4\0\1\25\2\0\1\25\1\0\1\25\103\0\2\34"+ - "\6\0\1\34\56\0\1\34\3\0\1\34\2\0\1\34"+ - "\3\0\1\34\5\0\1\34\7\0\1\34\4\0\2\34"+ - "\3\0\2\34\1\0\1\34\4\0\1\34\1\0\1\34"+ - "\2\0\2\34\1\0\3\34\1\0\1\34\2\0\4\34"+ - "\2\0\1\34\42\0\1\52\11\0\3\25\5\0\1\25"+ - "\1\0\1\25\1\0\1\25\4\0\1\25\4\0\1\52"+ - "\1\0\2\52\4\0\1\25\5\0\1\25\3\0\1\52"+ - "\4\0\1\52\2\25\2\52\10\0\1\52\1\0\2\25"+ - "\1\0\1\52\10\0\1\25\24\0\1\25\3\0\1\25"+ + "\1\0\1\116\3\0\1\43\5\0\1\44\3\0\1\117"+ + "\11\0\1\46\2\0\1\120\16\0\1\121\2\0\1\122"+ + "\41\0\1\25\1\34\1\52\1\0\1\53\1\0\1\53"+ + "\1\54\1\0\1\34\2\0\1\34\2\0\1\25\11\0"+ + "\3\25\5\0\1\25\1\0\1\25\1\0\1\25\4\0"+ + "\1\25\4\0\1\25\1\0\2\25\4\0\1\25\5\0"+ + "\1\25\3\0\1\25\4\0\5\25\10\0\1\52\1\0"+ + "\2\25\1\0\1\25\10\0\1\25\24\0\1\25\1\0"+ + "\1\52\7\0\2\25\2\0\5\25\2\0\2\25\4\0"+ + "\6\25\1\0\2\25\4\0\5\25\1\0\5\25\1\0"+ + "\2\25\1\0\3\25\1\0\4\25\1\0\5\25\1\52"+ + "\1\0\1\25\1\0\1\25\1\0\3\25\2\0\1\25"+ + "\1\0\1\25\1\0\1\25\2\0\1\25\17\0\1\25"+ + "\3\0\1\25\5\0\2\25\3\0\1\25\4\0\3\25"+ + "\4\0\1\25\1\0\1\25\2\0\1\25\1\0\2\25"+ + "\4\0\1\25\1\0\1\25\3\0\2\25\1\0\1\25"+ + "\5\0\3\25\1\0\1\25\10\0\1\25\1\0\2\52"+ + "\1\0\1\25\10\0\1\25\24\0\1\25\3\0\1\25"+ "\6\0\2\25\5\0\1\25\1\0\1\25\1\0\1\25"+ "\1\0\11\25\2\0\1\25\4\0\1\25\4\0\6\25"+ - "\2\0\1\25\1\0\1\25\1\0\3\25\1\0\1\52"+ + "\2\0\1\25\1\0\1\25\1\0\3\25\1\0\1\25"+ + "\1\0\2\25\4\0\3\25\1\0\1\25\10\0\1\25"+ + "\1\0\2\25\21\0\1\25\3\0\1\25\5\0\1\25"+ + "\32\0\15\25\5\0\3\25\1\0\1\25\5\0\3\25"+ + "\5\0\1\25\2\0\2\25\4\0\1\25\2\0\1\25"+ + "\1\0\1\25\103\0\2\25\6\0\1\25\56\0\1\25"+ + "\3\0\1\25\2\0\1\25\3\0\1\25\5\0\1\25"+ + "\7\0\1\25\4\0\2\25\3\0\2\25\1\0\1\25"+ + "\4\0\1\25\1\0\1\25\2\0\2\25\1\0\3\25"+ + "\1\0\1\25\2\0\4\25\2\0\1\25\41\0\1\35"+ + "\1\0\1\36\1\0\1\37\1\0\1\40\1\0\1\41"+ + "\1\0\1\123\3\0\1\43\5\0\1\44\3\0\1\124"+ + "\11\0\1\46\2\0\1\125\16\0\1\126\2\0\1\127"+ + "\41\0\1\25\2\52\2\0\2\130\1\54\1\0\1\52"+ + "\2\0\1\25\1\0\1\35\1\0\1\36\1\0\1\37"+ + "\1\0\1\40\1\0\1\41\1\0\1\131\3\0\1\132"+ + "\5\0\1\133\3\0\1\134\11\0\1\46\2\0\1\135"+ + "\16\0\1\136\2\0\1\137\41\0\1\25\1\53\7\0"+ + "\1\53\2\0\1\25\1\0\1\35\1\0\1\36\1\0"+ + "\1\37\1\0\1\40\1\0\1\41\1\0\1\140\3\0"+ + "\1\43\5\0\1\44\3\0\1\141\11\0\1\46\2\0"+ + "\1\142\16\0\1\143\2\0\1\144\21\0\1\101\17\0"+ + "\1\25\1\54\1\52\1\103\3\0\1\54\1\0\1\54"+ + "\2\0\1\25\2\0\1\26\11\0\3\25\5\0\1\25"+ + "\1\0\1\25\1\0\1\25\4\0\1\25\4\0\1\26"+ + "\1\0\2\26\4\0\1\25\5\0\1\25\3\0\1\26"+ + "\4\0\1\26\2\25\2\26\10\0\1\26\1\0\2\25"+ + "\1\0\1\26\10\0\1\25\24\0\1\25\3\0\1\25"+ + "\6\0\2\25\5\0\1\25\1\0\1\25\1\0\1\25"+ + "\1\0\11\25\2\0\1\25\4\0\1\25\4\0\6\25"+ + "\2\0\1\25\1\0\1\25\1\0\3\25\1\0\1\26"+ "\1\0\2\25\4\0\3\25\1\0\1\25\10\0\1\25"+ "\1\0\2\25\21\0\1\25\3\0\1\25\5\0\1\25"+ "\32\0\15\25\5\0\3\25\1\0\1\25\5\0\1\25"+ - "\2\52\5\0\1\25\2\0\1\25\1\52\4\0\1\25"+ - "\2\0\1\25\1\0\1\25\103\0\2\52\6\0\1\52"+ - "\56\0\1\52\3\0\1\52\2\0\1\52\3\0\1\52"+ - "\5\0\1\52\7\0\1\52\4\0\2\52\3\0\2\52"+ - "\1\0\1\52\4\0\1\52\1\0\1\52\2\0\2\52"+ - "\1\0\3\52\1\0\1\52\2\0\4\52\2\0\1\52"+ - "\53\0\1\154\3\0\1\155\5\0\1\156\3\0\1\157"+ - "\14\0\1\160\16\0\1\161\2\0\1\162\42\0\1\116"+ - "\1\52\6\0\1\116\5\0\1\53\11\0\3\25\5\0"+ - "\1\25\1\0\1\25\1\0\1\25\4\0\1\25\4\0"+ - "\1\53\1\0\2\53\4\0\1\25\5\0\1\25\3\0"+ - "\1\53\4\0\1\53\2\25\2\53\12\0\2\25\1\0"+ - "\1\53\10\0\1\25\24\0\1\25\11\0\2\25\2\0"+ - "\5\25\2\0\2\25\4\0\6\25\1\0\2\25\4\0"+ - "\5\25\1\0\5\25\1\0\2\25\1\0\3\25\1\0"+ - "\4\25\1\0\5\25\2\0\1\25\1\0\1\25\1\0"+ - "\3\25\2\0\1\25\1\0\1\25\1\0\1\25\2\0"+ - "\1\25\17\0\1\25\3\0\1\25\5\0\2\25\3\0"+ - "\1\25\4\0\3\25\4\0\1\25\1\0\1\25\2\0"+ - "\1\25\1\0\2\25\4\0\1\25\1\0\1\25\3\0"+ - "\2\25\1\0\1\25\5\0\3\25\1\0\1\25\10\0"+ - "\1\25\4\0\1\25\10\0\1\25\24\0\1\25\3\0"+ - "\1\25\6\0\2\25\5\0\1\25\1\0\1\25\1\0"+ - "\1\25\1\0\11\25\2\0\1\25\4\0\1\25\4\0"+ - "\6\25\2\0\1\25\1\0\1\25\1\0\3\25\1\0"+ - "\1\53\1\0\2\25\4\0\3\25\1\0\1\25\10\0"+ - "\1\25\1\0\2\25\21\0\1\25\3\0\1\25\5\0"+ - "\1\25\32\0\15\25\5\0\3\25\1\0\1\25\5\0"+ - "\1\25\2\53\5\0\1\25\2\0\1\25\1\53\4\0"+ - "\1\25\2\0\1\25\1\0\1\25\103\0\2\53\6\0"+ - "\1\53\56\0\1\53\3\0\1\53\2\0\1\53\3\0"+ - "\1\53\5\0\1\53\7\0\1\53\4\0\2\53\3\0"+ - "\2\53\1\0\1\53\4\0\1\53\1\0\1\53\2\0"+ - "\2\53\1\0\3\53\1\0\1\53\2\0\4\53\2\0"+ - "\1\53\42\0\1\54\11\0\3\25\5\0\1\25\1\0"+ - "\1\25\1\0\1\25\4\0\1\25\4\0\1\54\1\0"+ - "\2\54\4\0\1\25\5\0\1\25\3\0\1\54\4\0"+ - "\1\54\2\25\2\54\10\0\1\52\1\0\2\25\1\0"+ - "\1\54\10\0\1\25\24\0\1\25\3\0\1\25\6\0"+ - "\2\25\5\0\1\25\1\0\1\25\1\0\1\25\1\0"+ - "\11\25\2\0\1\25\4\0\1\25\4\0\6\25\2\0"+ - "\1\25\1\0\1\25\1\0\3\25\1\0\1\54\1\0"+ - "\2\25\4\0\3\25\1\0\1\25\10\0\1\25\1\0"+ - "\2\25\21\0\1\25\3\0\1\25\5\0\1\25\32\0"+ - "\15\25\5\0\3\25\1\0\1\25\5\0\1\25\2\54"+ - "\5\0\1\25\2\0\1\25\1\54\4\0\1\25\2\0"+ - "\1\25\1\0\1\25\103\0\2\54\6\0\1\54\56\0"+ - "\1\54\3\0\1\54\2\0\1\54\3\0\1\54\5\0"+ - "\1\54\7\0\1\54\4\0\2\54\3\0\2\54\1\0"+ - "\1\54\4\0\1\54\1\0\1\54\2\0\2\54\1\0"+ - "\3\54\1\0\1\54\2\0\4\54\2\0\1\54\42\0"+ - "\1\64\37\0\1\64\1\0\2\64\16\0\1\64\4\0"+ - "\1\64\2\0\2\64\10\0\1\26\4\0\1\64\37\0"+ - "\1\26\102\0\1\26\147\0\2\26\134\0\1\64\153\0"+ - "\2\64\11\0\1\64\115\0\2\64\6\0\1\64\56\0"+ - "\1\64\3\0\1\64\2\0\1\64\3\0\1\64\5\0"+ - "\1\64\7\0\1\64\4\0\2\64\3\0\2\64\1\0"+ - "\1\64\4\0\1\64\1\0\1\64\2\0\2\64\1\0"+ - "\3\64\1\0\1\64\2\0\4\64\2\0\1\64\42\0"+ - "\1\65\11\0\3\25\5\0\1\25\1\0\1\25\1\0"+ - "\1\25\4\0\1\25\4\0\1\65\1\0\2\65\4\0"+ - "\1\25\5\0\1\25\3\0\1\65\4\0\1\65\2\25"+ - "\2\65\10\0\1\26\1\0\2\25\1\0\1\65\10\0"+ + "\2\26\5\0\1\25\2\0\1\25\1\26\4\0\1\25"+ + "\2\0\1\25\1\0\1\25\103\0\2\26\6\0\1\26"+ + "\56\0\1\26\3\0\1\26\2\0\1\26\3\0\1\26"+ + "\5\0\1\26\7\0\1\26\4\0\2\26\3\0\2\26"+ + "\1\0\1\26\4\0\1\26\1\0\1\26\2\0\2\26"+ + "\1\0\3\26\1\0\1\26\2\0\4\26\2\0\1\26"+ + "\53\0\1\145\3\0\1\146\5\0\1\147\3\0\1\150"+ + "\14\0\1\151\16\0\1\152\2\0\1\153\42\0\1\64"+ + "\1\26\6\0\1\64\4\0\1\35\1\0\1\36\1\0"+ + "\1\37\1\0\1\40\1\0\1\41\1\0\1\154\3\0"+ + "\1\56\5\0\1\57\3\0\1\155\11\0\1\46\2\0"+ + "\1\156\16\0\1\157\2\0\1\160\21\0\1\101\17\0"+ + "\1\25\1\65\1\26\1\103\3\0\1\65\1\0\1\65"+ + "\2\0\1\25\2\0\1\27\37\0\1\27\1\0\2\27"+ + "\16\0\1\27\4\0\1\27\2\0\2\27\15\0\1\27"+ + "\132\0\1\27\153\0\2\27\11\0\1\27\115\0\2\27"+ + "\6\0\1\27\56\0\1\27\3\0\1\27\2\0\1\27"+ + "\3\0\1\27\5\0\1\27\7\0\1\27\4\0\2\27"+ + "\3\0\2\27\1\0\1\27\4\0\1\27\1\0\1\27"+ + "\2\0\2\27\1\0\3\27\1\0\1\27\2\0\4\27"+ + "\2\0\1\27\153\0\1\27\35\0\1\102\11\0\3\25"+ + "\5\0\1\25\1\0\1\25\1\0\1\25\4\0\1\25"+ + "\4\0\1\102\1\0\2\102\4\0\1\25\5\0\1\25"+ + "\3\0\1\102\4\0\1\102\2\25\2\102\10\0\1\26"+ + "\1\0\2\25\1\0\1\102\10\0\1\25\24\0\1\25"+ + "\3\0\1\25\6\0\2\25\5\0\1\25\1\0\1\25"+ + "\1\0\1\25\1\0\11\25\2\0\1\25\4\0\1\25"+ + "\4\0\6\25\2\0\1\25\1\0\1\25\1\0\3\25"+ + "\1\0\1\102\1\0\2\25\4\0\3\25\1\0\1\25"+ + "\10\0\1\25\1\0\2\25\21\0\1\25\3\0\1\25"+ + "\5\0\1\25\32\0\15\25\5\0\3\25\1\0\1\25"+ + "\5\0\1\25\2\102\5\0\1\25\2\0\1\25\1\102"+ + "\4\0\1\25\2\0\1\25\1\0\1\25\103\0\2\102"+ + "\6\0\1\102\56\0\1\102\3\0\1\102\2\0\1\102"+ + "\3\0\1\102\5\0\1\102\7\0\1\102\4\0\2\102"+ + "\3\0\2\102\1\0\1\102\4\0\1\102\1\0\1\102"+ + "\2\0\2\102\1\0\3\102\1\0\1\102\2\0\4\102"+ + "\2\0\1\102\153\0\1\103\46\0\1\161\15\0\1\162"+ + "\14\0\1\163\16\0\1\164\2\0\1\165\21\0\1\101"+ + "\20\0\1\103\1\0\1\103\3\0\1\54\1\0\1\103"+ + "\5\0\1\32\37\0\1\32\1\0\2\32\16\0\1\32"+ + "\4\0\1\32\2\0\2\32\15\0\1\32\132\0\1\32"+ + "\153\0\2\32\11\0\1\32\115\0\2\32\6\0\1\32"+ + "\56\0\1\32\3\0\1\32\2\0\1\32\3\0\1\32"+ + "\5\0\1\32\7\0\1\32\4\0\2\32\3\0\2\32"+ + "\1\0\1\32\4\0\1\32\1\0\1\32\2\0\2\32"+ + "\1\0\3\32\1\0\1\32\2\0\4\32\2\0\1\32"+ + "\42\0\1\33\37\0\1\33\1\0\2\33\16\0\1\33"+ + "\4\0\1\33\2\0\2\33\15\0\1\33\132\0\1\33"+ + "\153\0\2\33\11\0\1\33\115\0\2\33\6\0\1\33"+ + "\56\0\1\33\3\0\1\33\2\0\1\33\3\0\1\33"+ + "\5\0\1\33\7\0\1\33\4\0\2\33\3\0\2\33"+ + "\1\0\1\33\4\0\1\33\1\0\1\33\2\0\2\33"+ + "\1\0\3\33\1\0\1\33\2\0\4\33\2\0\1\33"+ + "\42\0\1\34\11\0\3\25\5\0\1\25\1\0\1\25"+ + "\1\0\1\25\4\0\1\25\4\0\1\34\1\0\2\34"+ + "\4\0\1\25\5\0\1\25\3\0\1\34\4\0\1\34"+ + "\2\25\2\34\10\0\1\52\1\0\2\25\1\0\1\34"+ + "\10\0\1\25\24\0\1\25\3\0\1\25\6\0\2\25"+ + "\5\0\1\25\1\0\1\25\1\0\1\25\1\0\11\25"+ + "\2\0\1\25\4\0\1\25\4\0\6\25\2\0\1\25"+ + "\1\0\1\25\1\0\3\25\1\0\1\34\1\0\2\25"+ + "\4\0\3\25\1\0\1\25\10\0\1\25\1\0\2\25"+ + "\21\0\1\25\3\0\1\25\5\0\1\25\32\0\15\25"+ + "\5\0\3\25\1\0\1\25\5\0\1\25\2\34\5\0"+ + "\1\25\2\0\1\25\1\34\4\0\1\25\2\0\1\25"+ + "\1\0\1\25\103\0\2\34\6\0\1\34\56\0\1\34"+ + "\3\0\1\34\2\0\1\34\3\0\1\34\5\0\1\34"+ + "\7\0\1\34\4\0\2\34\3\0\2\34\1\0\1\34"+ + "\4\0\1\34\1\0\1\34\2\0\2\34\1\0\3\34"+ + "\1\0\1\34\2\0\4\34\2\0\1\34\42\0\1\52"+ + "\11\0\3\25\5\0\1\25\1\0\1\25\1\0\1\25"+ + "\4\0\1\25\4\0\1\52\1\0\2\52\4\0\1\25"+ + "\5\0\1\25\3\0\1\52\4\0\1\52\2\25\2\52"+ + "\10\0\1\52\1\0\2\25\1\0\1\52\10\0\1\25"+ + "\24\0\1\25\3\0\1\25\6\0\2\25\5\0\1\25"+ + "\1\0\1\25\1\0\1\25\1\0\11\25\2\0\1\25"+ + "\4\0\1\25\4\0\6\25\2\0\1\25\1\0\1\25"+ + "\1\0\3\25\1\0\1\52\1\0\2\25\4\0\3\25"+ + "\1\0\1\25\10\0\1\25\1\0\2\25\21\0\1\25"+ + "\3\0\1\25\5\0\1\25\32\0\15\25\5\0\3\25"+ + "\1\0\1\25\5\0\1\25\2\52\5\0\1\25\2\0"+ + "\1\25\1\52\4\0\1\25\2\0\1\25\1\0\1\25"+ + "\103\0\2\52\6\0\1\52\56\0\1\52\3\0\1\52"+ + "\2\0\1\52\3\0\1\52\5\0\1\52\7\0\1\52"+ + "\4\0\2\52\3\0\2\52\1\0\1\52\4\0\1\52"+ + "\1\0\1\52\2\0\2\52\1\0\3\52\1\0\1\52"+ + "\2\0\4\52\2\0\1\52\53\0\1\166\3\0\1\167"+ + "\5\0\1\170\3\0\1\171\14\0\1\172\16\0\1\173"+ + "\2\0\1\174\42\0\1\130\1\52\6\0\1\130\5\0"+ + "\1\53\11\0\3\25\5\0\1\25\1\0\1\25\1\0"+ + "\1\25\4\0\1\25\4\0\1\53\1\0\2\53\4\0"+ + "\1\25\5\0\1\25\3\0\1\53\4\0\1\53\2\25"+ + "\2\53\12\0\2\25\1\0\1\53\10\0\1\25\24\0"+ + "\1\25\11\0\2\25\2\0\5\25\2\0\2\25\4\0"+ + "\6\25\1\0\2\25\4\0\5\25\1\0\5\25\1\0"+ + "\2\25\1\0\3\25\1\0\4\25\1\0\5\25\2\0"+ + "\1\25\1\0\1\25\1\0\3\25\2\0\1\25\1\0"+ + "\1\25\1\0\1\25\2\0\1\25\17\0\1\25\3\0"+ + "\1\25\5\0\2\25\3\0\1\25\4\0\3\25\4\0"+ + "\1\25\1\0\1\25\2\0\1\25\1\0\2\25\4\0"+ + "\1\25\1\0\1\25\3\0\2\25\1\0\1\25\5\0"+ + "\3\25\1\0\1\25\10\0\1\25\4\0\1\25\10\0"+ "\1\25\24\0\1\25\3\0\1\25\6\0\2\25\5\0"+ "\1\25\1\0\1\25\1\0\1\25\1\0\11\25\2\0"+ "\1\25\4\0\1\25\4\0\6\25\2\0\1\25\1\0"+ - "\1\25\1\0\3\25\1\0\1\65\1\0\2\25\4\0"+ + "\1\25\1\0\3\25\1\0\1\53\1\0\2\25\4\0"+ "\3\25\1\0\1\25\10\0\1\25\1\0\2\25\21\0"+ "\1\25\3\0\1\25\5\0\1\25\32\0\15\25\5\0"+ - "\3\25\1\0\1\25\5\0\1\25\2\65\5\0\1\25"+ - "\2\0\1\25\1\65\4\0\1\25\2\0\1\25\1\0"+ - "\1\25\103\0\2\65\6\0\1\65\56\0\1\65\3\0"+ - "\1\65\2\0\1\65\3\0\1\65\5\0\1\65\7\0"+ - "\1\65\4\0\2\65\3\0\2\65\1\0\1\65\4\0"+ - "\1\65\1\0\1\65\2\0\2\65\1\0\3\65\1\0"+ - "\1\65\2\0\4\65\2\0\1\65\42\0\1\103\37\0"+ - "\1\103\1\0\2\103\16\0\1\103\4\0\1\103\2\0"+ - "\2\103\15\0\1\103\132\0\1\103\153\0\2\103\11\0"+ - "\1\103\115\0\2\103\6\0\1\103\56\0\1\103\3\0"+ - "\1\103\2\0\1\103\3\0\1\103\5\0\1\103\7\0"+ - "\1\103\4\0\2\103\3\0\2\103\1\0\1\103\4\0"+ - "\1\103\1\0\1\103\2\0\2\103\1\0\3\103\1\0"+ - "\1\103\2\0\4\103\2\0\1\103\42\0\1\116\37\0"+ - "\1\116\1\0\2\116\16\0\1\116\4\0\1\116\2\0"+ - "\2\116\10\0\1\52\4\0\1\116\37\0\1\52\102\0"+ - "\1\52\147\0\2\52\134\0\1\116\153\0\2\116\11\0"+ - "\1\116\115\0\2\116\6\0\1\116\56\0\1\116\3\0"+ - "\1\116\2\0\1\116\3\0\1\116\5\0\1\116\7\0"+ - "\1\116\4\0\2\116\3\0\2\116\1\0\1\116\4\0"+ - "\1\116\1\0\1\116\2\0\2\116\1\0\3\116\1\0"+ - "\1\116\2\0\4\116\2\0\1\116\40\0"; + "\3\25\1\0\1\25\5\0\1\25\2\53\5\0\1\25"+ + "\2\0\1\25\1\53\4\0\1\25\2\0\1\25\1\0"+ + "\1\25\103\0\2\53\6\0\1\53\56\0\1\53\3\0"+ + "\1\53\2\0\1\53\3\0\1\53\5\0\1\53\7\0"+ + "\1\53\4\0\2\53\3\0\2\53\1\0\1\53\4\0"+ + "\1\53\1\0\1\53\2\0\2\53\1\0\3\53\1\0"+ + "\1\53\2\0\4\53\2\0\1\53\42\0\1\54\11\0"+ + "\3\25\5\0\1\25\1\0\1\25\1\0\1\25\4\0"+ + "\1\25\4\0\1\54\1\0\2\54\4\0\1\25\5\0"+ + "\1\25\3\0\1\54\4\0\1\54\2\25\2\54\10\0"+ + "\1\52\1\0\2\25\1\0\1\54\10\0\1\25\24\0"+ + "\1\25\3\0\1\25\6\0\2\25\5\0\1\25\1\0"+ + "\1\25\1\0\1\25\1\0\11\25\2\0\1\25\4\0"+ + "\1\25\4\0\6\25\2\0\1\25\1\0\1\25\1\0"+ + "\3\25\1\0\1\54\1\0\2\25\4\0\3\25\1\0"+ + "\1\25\10\0\1\25\1\0\2\25\21\0\1\25\3\0"+ + "\1\25\5\0\1\25\32\0\15\25\5\0\3\25\1\0"+ + "\1\25\5\0\1\25\2\54\5\0\1\25\2\0\1\25"+ + "\1\54\4\0\1\25\2\0\1\25\1\0\1\25\103\0"+ + "\2\54\6\0\1\54\56\0\1\54\3\0\1\54\2\0"+ + "\1\54\3\0\1\54\5\0\1\54\7\0\1\54\4\0"+ + "\2\54\3\0\2\54\1\0\1\54\4\0\1\54\1\0"+ + "\1\54\2\0\2\54\1\0\3\54\1\0\1\54\2\0"+ + "\4\54\2\0\1\54\42\0\1\64\37\0\1\64\1\0"+ + "\2\64\16\0\1\64\4\0\1\64\2\0\2\64\10\0"+ + "\1\26\4\0\1\64\37\0\1\26\102\0\1\26\147\0"+ + "\2\26\134\0\1\64\153\0\2\64\11\0\1\64\115\0"+ + "\2\64\6\0\1\64\56\0\1\64\3\0\1\64\2\0"+ + "\1\64\3\0\1\64\5\0\1\64\7\0\1\64\4\0"+ + "\2\64\3\0\2\64\1\0\1\64\4\0\1\64\1\0"+ + "\1\64\2\0\2\64\1\0\3\64\1\0\1\64\2\0"+ + "\4\64\2\0\1\64\42\0\1\65\11\0\3\25\5\0"+ + "\1\25\1\0\1\25\1\0\1\25\4\0\1\25\4\0"+ + "\1\65\1\0\2\65\4\0\1\25\5\0\1\25\3\0"+ + "\1\65\4\0\1\65\2\25\2\65\10\0\1\26\1\0"+ + "\2\25\1\0\1\65\10\0\1\25\24\0\1\25\3\0"+ + "\1\25\6\0\2\25\5\0\1\25\1\0\1\25\1\0"+ + "\1\25\1\0\11\25\2\0\1\25\4\0\1\25\4\0"+ + "\6\25\2\0\1\25\1\0\1\25\1\0\3\25\1\0"+ + "\1\65\1\0\2\25\4\0\3\25\1\0\1\25\10\0"+ + "\1\25\1\0\2\25\21\0\1\25\3\0\1\25\5\0"+ + "\1\25\32\0\15\25\5\0\3\25\1\0\1\25\5\0"+ + "\1\25\2\65\5\0\1\25\2\0\1\25\1\65\4\0"+ + "\1\25\2\0\1\25\1\0\1\25\103\0\2\65\6\0"+ + "\1\65\56\0\1\65\3\0\1\65\2\0\1\65\3\0"+ + "\1\65\5\0\1\65\7\0\1\65\4\0\2\65\3\0"+ + "\2\65\1\0\1\65\4\0\1\65\1\0\1\65\2\0"+ + "\2\65\1\0\3\65\1\0\1\65\2\0\4\65\2\0"+ + "\1\65\42\0\1\103\37\0\1\103\1\0\2\103\16\0"+ + "\1\103\4\0\1\103\2\0\2\103\15\0\1\103\132\0"+ + "\1\103\153\0\2\103\11\0\1\103\115\0\2\103\6\0"+ + "\1\103\56\0\1\103\3\0\1\103\2\0\1\103\3\0"+ + "\1\103\5\0\1\103\7\0\1\103\4\0\2\103\3\0"+ + "\2\103\1\0\1\103\4\0\1\103\1\0\1\103\2\0"+ + "\2\103\1\0\3\103\1\0\1\103\2\0\4\103\2\0"+ + "\1\103\42\0\1\130\37\0\1\130\1\0\2\130\16\0"+ + "\1\130\4\0\1\130\2\0\2\130\10\0\1\52\4\0"+ + "\1\130\37\0\1\52\102\0\1\52\147\0\2\52\134\0"+ + "\1\130\153\0\2\130\11\0\1\130\115\0\2\130\6\0"+ + "\1\130\56\0\1\130\3\0\1\130\2\0\1\130\3\0"+ + "\1\130\5\0\1\130\7\0\1\130\4\0\2\130\3\0"+ + "\2\130\1\0\1\130\4\0\1\130\1\0\1\130\2\0"+ + "\2\130\1\0\3\130\1\0\1\130\2\0\4\130\2\0"+ + "\1\130\40\0"; private static int [] zzUnpackTrans() { - int [] result = new int[10609]; + int [] result = new int[11845]; int offset = 0; offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); return result; @@ -621,11 +640,11 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface { private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); private static final String ZZ_ATTRIBUTE_PACKED_0 = - "\1\0\1\11\27\1\2\11\1\1\15\0\1\1\1\0"+ - "\1\1\10\0\1\1\15\0\1\1\57\0"; + "\1\0\1\11\32\1\15\0\1\1\1\0\1\1\10\0"+ + "\1\1\15\0\1\1\71\0"; private static int [] zzUnpackAttribute() { - int [] result = new int[114]; + int [] result = new int[124]; int offset = 0; offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); return result; diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex index 219488375f0..e624074b2f8 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex @@ -71,6 +71,8 @@ MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})* MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})* ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})* +HanEx = {Han} ({Format} | {Extend})* +HiraganaEx = {Hiragana} ({Format} | {Extend})* %{ /** Alphanumeric sequences */ @@ -178,8 +180,8 @@ ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})* // UAX#29 WB14. Any ÷ Any // -{Han} { return IDEOGRAPHIC_TYPE; } -{Hiragana} { return HIRAGANA_TYPE; } +{HanEx} { return IDEOGRAPHIC_TYPE; } +{HiraganaEx} { return HIRAGANA_TYPE; } // UAX#29 WB3. CR × LF diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerInterface.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerInterface.java index f78cee30c74..eb2f0d7d161 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerInterface.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerInterface.java @@ -22,7 +22,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import java.io.Reader; import java.io.IOException; -interface StandardTokenizerInterface { +/** @lucene.internal */ +public interface StandardTokenizerInterface { /** This character denotes the end of file */ public static final int YYEOF = -1; @@ -30,12 +31,12 @@ interface StandardTokenizerInterface { /** * Copies the matched text into the CharTermAttribute */ - void getText(CharTermAttribute t); + public void getText(CharTermAttribute t); /** * Returns the current position. */ - int yychar(); + public int yychar(); /** * Resets the scanner to read from a new input stream. @@ -47,12 +48,12 @@ interface StandardTokenizerInterface { * * @param reader the new input stream */ - void yyreset(Reader reader); + public void yyreset(Reader reader); /** * Returns the length of the matched text region. */ - int yylength(); + public int yylength(); /** * Resumes scanning until the next regular expression is matched, @@ -61,6 +62,6 @@ interface StandardTokenizerInterface { * @return the next token, {@link #YYEOF} on end of stream * @exception IOException if any I/O-Error occurs */ - int getNextToken() throws IOException; + public int getNextToken() throws IOException; } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.java new file mode 100644 index 00000000000..3a9eadee383 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.java @@ -0,0 +1,1089 @@ +/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/4/11 4:07 PM */ + +package org.apache.lucene.analysis.standard.std31; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.standard.StandardTokenizerInterface; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +/** + * This class implements StandardTokenizer, except with a bug + * (https://issues.apache.org/jira/browse/LUCENE-3358) where Han and Hiragana + * characters would be split from combining characters: + * @deprecated This class is only for exact backwards compatibility + */ +@Deprecated + +/** + * This class is a scanner generated by + * JFlex 1.5.0-SNAPSHOT + * on 8/4/11 4:07 PM from the specification file + * /home/rmuir/workspace/lucene-clean-trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex + */ +public final class StandardTokenizerImpl31 implements StandardTokenizerInterface { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int YYINITIAL = 0; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\47\0\1\140\4\0\1\137\1\0\1\140\1\0\12\134\1\136\1\137"+ + "\5\0\32\132\4\0\1\141\1\0\32\132\57\0\1\132\2\0\1\133"+ + "\7\0\1\132\1\0\1\136\2\0\1\132\5\0\27\132\1\0\37\132"+ + "\1\0\u01ca\132\4\0\14\132\16\0\5\132\7\0\1\132\1\0\1\132"+ + "\21\0\160\133\5\132\1\0\2\132\2\0\4\132\1\137\7\0\1\132"+ + "\1\136\3\132\1\0\1\132\1\0\24\132\1\0\123\132\1\0\213\132"+ + "\1\0\7\133\236\132\11\0\46\132\2\0\1\132\7\0\47\132\1\0"+ + "\1\137\7\0\55\133\1\0\1\133\1\0\2\133\1\0\2\133\1\0"+ + "\1\133\10\0\33\132\5\0\4\132\1\136\13\0\4\133\10\0\2\137"+ + "\2\0\13\133\5\0\53\132\25\133\12\134\1\0\1\134\1\137\1\0"+ + "\2\132\1\133\143\132\1\0\1\132\7\133\1\133\1\0\6\133\2\132"+ + "\2\133\1\0\4\133\2\132\12\134\3\132\2\0\1\132\17\0\1\133"+ + "\1\132\1\133\36\132\33\133\2\0\131\132\13\133\1\132\16\0\12\134"+ + "\41\132\11\133\2\132\2\0\1\137\1\0\1\132\5\0\26\132\4\133"+ + "\1\132\11\133\1\132\3\133\1\132\5\133\22\0\31\132\3\133\244\0"+ + "\4\133\66\132\3\133\1\132\22\133\1\132\7\133\12\132\2\133\2\0"+ + "\12\134\1\0\7\132\1\0\7\132\1\0\3\133\1\0\10\132\2\0"+ + "\2\132\2\0\26\132\1\0\7\132\1\0\1\132\3\0\4\132\2\0"+ + "\1\133\1\132\7\133\2\0\2\133\2\0\3\133\1\132\10\0\1\133"+ + "\4\0\2\132\1\0\3\132\2\133\2\0\12\134\2\132\17\0\3\133"+ + "\1\0\6\132\4\0\2\132\2\0\26\132\1\0\7\132\1\0\2\132"+ + "\1\0\2\132\1\0\2\132\2\0\1\133\1\0\5\133\4\0\2\133"+ + "\2\0\3\133\3\0\1\133\7\0\4\132\1\0\1\132\7\0\12\134"+ + "\2\133\3\132\1\133\13\0\3\133\1\0\11\132\1\0\3\132\1\0"+ + "\26\132\1\0\7\132\1\0\2\132\1\0\5\132\2\0\1\133\1\132"+ + "\10\133\1\0\3\133\1\0\3\133\2\0\1\132\17\0\2\132\2\133"+ + "\2\0\12\134\21\0\3\133\1\0\10\132\2\0\2\132\2\0\26\132"+ + "\1\0\7\132\1\0\2\132\1\0\5\132\2\0\1\133\1\132\7\133"+ + "\2\0\2\133\2\0\3\133\10\0\2\133\4\0\2\132\1\0\3\132"+ + "\2\133\2\0\12\134\1\0\1\132\20\0\1\133\1\132\1\0\6\132"+ + "\3\0\3\132\1\0\4\132\3\0\2\132\1\0\1\132\1\0\2\132"+ + "\3\0\2\132\3\0\3\132\3\0\14\132\4\0\5\133\3\0\3\133"+ + "\1\0\4\133\2\0\1\132\6\0\1\133\16\0\12\134\21\0\3\133"+ + "\1\0\10\132\1\0\3\132\1\0\27\132\1\0\12\132\1\0\5\132"+ + "\3\0\1\132\7\133\1\0\3\133\1\0\4\133\7\0\2\133\1\0"+ + "\2\132\6\0\2\132\2\133\2\0\12\134\22\0\2\133\1\0\10\132"+ + "\1\0\3\132\1\0\27\132\1\0\12\132\1\0\5\132\2\0\1\133"+ + "\1\132\7\133\1\0\3\133\1\0\4\133\7\0\2\133\7\0\1\132"+ + "\1\0\2\132\2\133\2\0\12\134\1\0\2\132\17\0\2\133\1\0"+ + "\10\132\1\0\3\132\1\0\51\132\2\0\1\132\7\133\1\0\3\133"+ + "\1\0\4\133\1\132\10\0\1\133\10\0\2\132\2\133\2\0\12\134"+ + "\12\0\6\132\2\0\2\133\1\0\22\132\3\0\30\132\1\0\11\132"+ + "\1\0\1\132\2\0\7\132\3\0\1\133\4\0\6\133\1\0\1\133"+ + "\1\0\10\133\22\0\2\133\15\0\60\142\1\143\2\142\7\143\5\0"+ + "\7\142\10\143\1\0\12\134\47\0\2\142\1\0\1\142\2\0\2\142"+ + "\1\0\1\142\2\0\1\142\6\0\4\142\1\0\7\142\1\0\3\142"+ + "\1\0\1\142\1\0\1\142\2\0\2\142\1\0\4\142\1\143\2\142"+ + "\6\143\1\0\2\143\1\142\2\0\5\142\1\0\1\142\1\0\6\143"+ + "\2\0\12\134\2\0\2\142\42\0\1\132\27\0\2\133\6\0\12\134"+ + "\13\0\1\133\1\0\1\133\1\0\1\133\4\0\2\133\10\132\1\0"+ + "\44\132\4\0\24\133\1\0\2\133\5\132\13\133\1\0\44\133\11\0"+ + "\1\133\71\0\53\142\24\143\1\142\12\134\6\0\6\142\4\143\4\142"+ + "\3\143\1\142\3\143\2\142\7\143\3\142\4\143\15\142\14\143\1\142"+ + "\1\143\12\134\4\143\2\142\46\132\12\0\53\132\1\0\1\132\3\0"+ + "\u0100\146\111\132\1\0\4\132\2\0\7\132\1\0\1\132\1\0\4\132"+ + "\2\0\51\132\1\0\4\132\2\0\41\132\1\0\4\132\2\0\7\132"+ + "\1\0\1\132\1\0\4\132\2\0\17\132\1\0\71\132\1\0\4\132"+ + "\2\0\103\132\2\0\3\133\40\0\20\132\20\0\125\132\14\0\u026c\132"+ + "\2\0\21\132\1\0\32\132\5\0\113\132\3\0\3\132\17\0\15\132"+ + "\1\0\4\132\3\133\13\0\22\132\3\133\13\0\22\132\2\133\14\0"+ + "\15\132\1\0\3\132\1\0\2\133\14\0\64\142\2\143\36\143\3\0"+ + "\1\142\4\0\1\142\1\143\2\0\12\134\41\0\3\133\2\0\12\134"+ + "\6\0\130\132\10\0\51\132\1\133\1\132\5\0\106\132\12\0\35\132"+ + "\3\0\14\133\4\0\14\133\12\0\12\134\36\142\2\0\5\142\13\0"+ + "\54\142\4\0\21\143\7\142\2\143\6\0\12\134\1\142\3\0\2\142"+ + "\40\0\27\132\5\133\4\0\65\142\12\143\1\0\35\143\2\0\1\133"+ + "\12\134\6\0\12\134\6\0\16\142\122\0\5\133\57\132\21\133\7\132"+ + "\4\0\12\134\21\0\11\133\14\0\3\133\36\132\12\133\3\0\2\132"+ + "\12\134\6\0\46\132\16\133\14\0\44\132\24\133\10\0\12\134\3\0"+ + "\3\132\12\134\44\132\122\0\3\133\1\0\25\133\4\132\1\133\4\132"+ + "\1\133\15\0\300\132\47\133\25\0\4\133\u0116\132\2\0\6\132\2\0"+ + "\46\132\2\0\6\132\2\0\10\132\1\0\1\132\1\0\1\132\1\0"+ + "\1\132\1\0\37\132\2\0\65\132\1\0\7\132\1\0\1\132\3\0"+ + "\3\132\1\0\7\132\3\0\4\132\2\0\6\132\4\0\15\132\5\0"+ + "\3\132\1\0\7\132\17\0\2\133\2\133\10\0\2\140\12\0\1\140"+ + "\2\0\1\136\2\0\5\133\20\0\2\141\3\0\1\137\17\0\1\141"+ + "\13\0\5\133\5\0\6\133\1\0\1\132\15\0\1\132\20\0\15\132"+ + "\63\0\41\133\21\0\1\132\4\0\1\132\2\0\12\132\1\0\1\132"+ + "\3\0\5\132\6\0\1\132\1\0\1\132\1\0\1\132\1\0\4\132"+ + "\1\0\13\132\2\0\4\132\5\0\5\132\4\0\1\132\21\0\51\132"+ + "\u032d\0\64\132\u0716\0\57\132\1\0\57\132\1\0\205\132\6\0\4\132"+ + "\3\133\16\0\46\132\12\0\66\132\11\0\1\132\17\0\1\133\27\132"+ + "\11\0\7\132\1\0\7\132\1\0\7\132\1\0\7\132\1\0\7\132"+ + "\1\0\7\132\1\0\7\132\1\0\7\132\1\0\40\133\57\0\1\132"+ + "\120\0\32\144\1\0\131\144\14\0\326\144\57\0\1\132\1\0\1\144"+ + "\31\0\11\144\4\133\2\133\1\0\5\135\2\0\3\144\1\132\1\132"+ + "\4\0\126\145\2\0\2\133\2\135\3\145\133\135\1\0\4\135\5\0"+ + "\51\132\3\0\136\146\21\0\33\132\65\0\20\135\37\0\101\0\37\0"+ + "\121\0\57\135\1\0\130\135\250\0\u19b6\144\112\0\u51cc\144\64\0\u048d\132"+ + "\103\0\56\132\2\0\u010d\132\3\0\20\132\12\134\2\132\24\0\57\132"+ + "\4\133\11\0\2\133\1\0\31\132\10\0\120\132\2\133\45\0\11\132"+ + "\2\0\147\132\2\0\4\132\1\0\2\132\16\0\12\132\120\0\10\132"+ + "\1\133\3\132\1\133\4\132\1\133\27\132\5\133\30\0\64\132\14\0"+ + "\2\133\62\132\21\133\13\0\12\134\6\0\22\133\6\132\3\0\1\132"+ + "\4\0\12\134\34\132\10\133\2\0\27\132\15\133\14\0\35\146\3\0"+ + "\4\133\57\132\16\133\16\0\1\132\12\134\46\0\51\132\16\133\11\0"+ + "\3\132\1\133\10\132\2\133\2\0\12\134\6\0\33\142\1\143\4\0"+ + "\60\142\1\143\1\142\3\143\2\142\2\143\5\142\2\143\1\142\1\143"+ + "\1\142\30\0\5\142\41\0\6\132\2\0\6\132\2\0\6\132\11\0"+ + "\7\132\1\0\7\132\221\0\43\132\10\133\1\0\2\133\2\0\12\134"+ + "\6\0\u2ba4\146\14\0\27\146\4\0\61\146\4\0\1\31\1\25\1\46"+ + "\1\43\1\13\3\0\1\7\1\5\2\0\1\3\1\1\14\0\1\11"+ + "\21\0\1\112\7\0\1\65\1\17\6\0\1\130\3\0\1\120\1\120"+ + "\1\120\1\120\1\120\1\120\1\120\1\120\1\120\1\120\1\120\1\120"+ + "\1\120\1\120\1\120\1\120\1\120\1\120\1\120\1\120\1\120\1\120"+ + "\1\120\1\120\1\120\1\120\1\120\1\120\1\120\1\120\1\120\1\120"+ + "\1\120\1\120\1\120\1\120\1\120\1\120\1\120\1\120\1\120\1\121"+ + "\1\120\1\120\1\120\1\125\1\123\17\0\1\114\u02c1\0\1\70\277\0"+ + "\1\113\1\71\1\2\3\124\2\35\1\124\1\35\2\124\1\14\21\124"+ + "\2\60\7\73\1\72\7\73\7\52\1\15\1\52\1\75\2\45\1\44"+ + "\1\75\1\45\1\44\10\75\2\63\5\61\2\54\5\61\1\6\10\37"+ + "\5\21\3\27\12\106\20\27\3\42\32\30\1\26\2\24\2\110\1\111"+ + "\2\110\2\111\2\110\1\111\3\24\1\16\2\24\12\64\1\74\1\41"+ + "\1\34\1\64\6\41\1\34\66\41\5\115\6\103\1\51\4\103\2\51"+ + "\10\103\1\51\7\100\1\12\2\100\32\103\1\12\4\100\1\12\5\102"+ + "\1\101\1\102\3\101\7\102\1\101\23\102\5\67\3\102\6\67\2\67"+ + "\6\66\10\66\2\100\7\66\36\100\4\66\102\100\15\115\1\77\2\115"+ + "\1\131\3\117\1\115\2\117\5\115\4\117\4\116\1\115\3\116\1\115"+ + "\5\116\26\56\4\23\1\105\2\104\4\122\1\104\2\122\3\76\33\122"+ + "\35\55\3\122\35\126\3\122\6\126\2\33\31\126\1\33\17\126\6\122"+ + "\4\22\1\10\37\22\1\10\4\22\25\62\1\127\11\62\21\55\5\62"+ + "\1\57\12\40\13\62\4\55\1\50\6\55\12\122\17\55\1\47\3\53"+ + "\15\20\11\36\1\32\24\36\2\20\11\36\1\32\31\36\1\32\4\20"+ + "\4\36\2\32\2\107\1\4\5\107\52\4\u1900\0\u012e\144\2\0\76\144"+ + "\2\0\152\144\46\0\7\132\14\0\5\132\5\0\1\132\1\133\12\132"+ + "\1\0\15\132\1\0\5\132\1\0\1\132\1\0\2\132\1\0\2\132"+ + "\1\0\154\132\41\0\u016b\132\22\0\100\132\2\0\66\132\50\0\14\132"+ + "\4\0\20\133\1\137\2\0\1\136\1\137\13\0\7\133\14\0\2\141"+ + "\30\0\3\141\1\137\1\0\1\140\1\0\1\137\1\136\32\0\5\132"+ + "\1\0\207\132\2\0\1\133\7\0\1\140\4\0\1\137\1\0\1\140"+ + "\1\0\12\134\1\136\1\137\5\0\32\132\4\0\1\141\1\0\32\132"+ + "\13\0\70\135\2\133\37\146\3\0\6\146\2\0\6\146\2\0\6\146"+ + "\2\0\3\146\34\0\3\133\4\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\1\0\23\1\1\2\1\3\1\4\1\1\1\5\1\6"+ + "\1\7\1\10\15\0\1\2\1\0\1\2\10\0\1\3"+ + "\15\0\1\2\57\0"; + + private static int [] zzUnpackAction() { + int [] result = new int[114]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\147\0\316\0\u0135\0\u019c\0\u0203\0\u026a\0\u02d1"+ + "\0\u0338\0\u039f\0\u0406\0\u046d\0\u04d4\0\u053b\0\u05a2\0\u0609"+ + "\0\u0670\0\u06d7\0\u073e\0\u07a5\0\u080c\0\u0873\0\u08da\0\u0941"+ + "\0\u09a8\0\147\0\147\0\u0a0f\0\316\0\u0135\0\u019c\0\u0203"+ + "\0\u026a\0\u0a76\0\u0add\0\u0b44\0\u0bab\0\u046d\0\u0c12\0\u0c79"+ + "\0\u0ce0\0\u0d47\0\u0dae\0\u0e15\0\u0e7c\0\u0338\0\u039f\0\u0ee3"+ + "\0\u0f4a\0\u0fb1\0\u1018\0\u107f\0\u10e6\0\u114d\0\u11b4\0\u121b"+ + "\0\u1282\0\u12e9\0\u1350\0\u13b7\0\u141e\0\u1485\0\u14ec\0\u1553"+ + "\0\u15ba\0\u0941\0\u1621\0\u1688\0\u16ef\0\u1756\0\u17bd\0\u1824"+ + "\0\u188b\0\u18f2\0\u1959\0\u19c0\0\u1a27\0\u1a8e\0\u1af5\0\u1b5c"+ + "\0\u1bc3\0\u1c2a\0\u1c91\0\u1cf8\0\u1d5f\0\u1dc6\0\u1e2d\0\u1e94"+ + "\0\u1efb\0\u1f62\0\u1fc9\0\u2030\0\u2097\0\u20fe\0\u2165\0\u21cc"+ + "\0\u2233\0\u229a\0\u2301\0\u2368\0\u23cf\0\u2436\0\u249d\0\u2504"+ + "\0\u256b\0\u25d2\0\u2639\0\u26a0\0\u2707\0\u276e\0\u27d5\0\u283c"+ + "\0\u28a3\0\u290a"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[114]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\2\1\3\1\2\1\4\1\2\1\5\1\2\1\6"+ + "\1\2\1\7\1\2\1\10\3\2\1\11\5\2\1\12"+ + "\3\2\1\13\11\2\1\14\2\2\1\15\43\2\1\16"+ + "\1\2\1\17\3\2\1\20\1\21\1\2\1\22\1\2"+ + "\1\23\2\2\1\24\1\2\1\25\1\2\1\26\1\27"+ + "\3\2\1\30\2\31\1\32\1\33\1\34\151\0\1\25"+ + "\11\0\1\25\20\0\1\25\22\0\1\25\10\0\3\25"+ + "\17\0\1\25\10\0\1\25\24\0\1\25\1\0\1\25"+ + "\1\0\1\25\1\0\1\25\1\0\1\25\1\0\3\25"+ + "\1\0\5\25\1\0\3\25\1\0\11\25\1\0\2\25"+ + "\1\0\16\25\1\0\2\25\1\0\21\25\1\0\1\25"+ + "\1\0\3\25\2\0\1\25\1\0\1\25\1\0\2\25"+ + "\1\0\1\25\17\0\1\25\3\0\1\25\5\0\2\25"+ + "\3\0\1\25\13\0\1\25\1\0\1\25\4\0\2\25"+ + "\4\0\1\25\1\0\1\25\3\0\2\25\1\0\1\25"+ + "\5\0\3\25\1\0\1\25\15\0\1\25\10\0\1\25"+ + "\24\0\1\25\3\0\1\25\1\0\1\25\1\0\1\25"+ + "\1\0\3\25\2\0\4\25\1\0\3\25\2\0\3\25"+ + "\1\0\4\25\1\0\2\25\2\0\3\25\1\0\11\25"+ + "\1\0\2\25\1\0\16\25\1\0\2\25\1\0\1\25"+ + "\1\0\3\25\2\0\1\25\1\0\1\25\1\0\2\25"+ + "\1\0\1\25\17\0\1\25\3\0\1\25\3\0\1\25"+ + "\1\0\3\25\2\0\1\25\1\0\2\25\1\0\3\25"+ + "\3\0\2\25\1\0\1\25\1\0\2\25\1\0\2\25"+ + "\3\0\2\25\1\0\1\25\1\0\1\25\1\0\2\25"+ + "\1\0\2\25\1\0\2\25\1\0\5\25\1\0\5\25"+ + "\1\0\2\25\1\0\2\25\1\0\1\25\1\0\3\25"+ + "\4\0\1\25\4\0\1\25\31\0\3\25\5\0\1\25"+ + "\1\0\1\25\1\0\1\25\4\0\1\25\14\0\1\25"+ + "\5\0\1\25\11\0\2\25\12\0\1\26\1\0\2\25"+ + "\12\0\1\25\24\0\1\25\1\0\1\26\7\0\2\25"+ + "\2\0\5\25\2\0\2\25\4\0\6\25\1\0\2\25"+ + "\4\0\5\25\1\0\5\25\1\0\2\25\1\0\3\25"+ + "\1\0\4\25\1\0\5\25\1\26\1\0\1\25\1\0"+ + "\1\25\1\0\3\25\2\0\1\25\1\0\1\25\1\0"+ + "\1\25\2\0\1\25\17\0\1\25\3\0\1\25\5\0"+ + "\2\25\3\0\1\25\4\0\3\25\4\0\1\25\1\0"+ + "\1\25\2\0\1\25\1\0\2\25\4\0\1\25\1\0"+ + "\1\25\3\0\2\25\1\0\1\25\5\0\3\25\1\0"+ + "\1\25\10\0\1\25\1\0\2\26\1\0\1\25\10\0"+ + "\1\25\24\0\1\25\3\0\1\25\6\0\2\25\5\0"+ + "\1\25\1\0\1\25\1\0\1\25\1\0\11\25\2\0"+ + "\1\25\4\0\1\25\4\0\6\25\2\0\1\25\1\0"+ + "\1\25\1\0\3\25\3\0\2\25\4\0\3\25\1\0"+ + "\1\25\10\0\1\25\1\0\2\25\21\0\1\25\11\0"+ + "\2\25\17\0\1\25\6\0\2\25\4\0\1\25\5\0"+ + "\1\25\2\0\1\25\5\0\3\25\1\0\1\25\15\0"+ + "\1\25\10\0\1\25\24\0\1\25\3\0\1\25\5\0"+ + "\1\25\32\0\15\25\5\0\3\25\1\0\1\25\5\0"+ + "\1\25\7\0\1\25\2\0\1\25\5\0\1\25\2\0"+ + "\1\25\1\0\1\25\106\0\1\33\21\0\1\27\35\0"+ + "\1\32\3\0\1\32\3\0\1\32\1\0\3\32\2\0"+ + "\1\32\2\0\1\32\1\0\3\32\3\0\2\32\1\0"+ + "\1\32\1\0\2\32\1\0\2\32\3\0\2\32\1\0"+ + "\1\32\3\0\2\32\1\0\2\32\1\0\2\32\1\0"+ + "\5\32\1\0\5\32\2\0\1\32\1\0\2\32\1\0"+ + "\1\32\1\0\3\32\4\0\1\32\4\0\1\32\17\0"+ + "\1\32\1\0\1\32\1\0\1\32\1\0\1\32\1\0"+ + "\1\32\1\0\3\32\1\0\5\32\1\0\3\32\1\0"+ + "\11\32\1\0\2\32\1\0\16\32\1\0\2\32\1\0"+ + "\21\32\1\0\1\32\1\0\3\32\2\0\1\32\1\0"+ + "\1\32\1\0\2\32\1\0\1\32\17\0\1\32\1\0"+ + "\1\32\1\0\1\32\3\0\1\32\1\0\3\32\1\0"+ + "\2\32\1\0\2\32\1\0\3\32\1\0\11\32\1\0"+ + "\2\32\1\0\16\32\1\0\2\32\1\0\21\32\1\0"+ + "\1\32\1\0\3\32\2\0\1\32\1\0\1\32\1\0"+ + "\2\32\1\0\1\32\17\0\1\32\11\0\1\32\20\0"+ + "\1\32\33\0\1\32\21\0\1\32\10\0\1\32\24\0"+ + "\1\32\1\0\1\32\1\0\1\32\1\0\1\32\1\0"+ + "\1\32\1\0\3\32\1\0\5\32\1\0\3\32\1\0"+ + "\6\32\1\0\2\32\1\0\2\32\1\0\10\32\1\0"+ + "\5\32\1\0\2\32\1\0\21\32\1\0\1\32\1\0"+ + "\3\32\2\0\1\32\1\0\1\32\1\0\2\32\1\0"+ + "\1\32\146\0\1\33\16\0\1\35\1\0\1\36\1\0"+ + "\1\37\1\0\1\40\1\0\1\41\1\0\1\42\3\0"+ + "\1\43\5\0\1\44\3\0\1\45\11\0\1\46\2\0"+ + "\1\47\16\0\1\50\2\0\1\51\41\0\2\25\1\52"+ + "\1\0\1\53\1\0\1\53\1\54\1\0\1\25\2\0"+ + "\1\25\1\0\1\35\1\0\1\36\1\0\1\37\1\0"+ + "\1\40\1\0\1\41\1\0\1\55\3\0\1\56\5\0"+ + "\1\57\3\0\1\60\11\0\1\46\2\0\1\61\16\0"+ + "\1\62\2\0\1\63\41\0\1\25\2\26\2\0\2\64"+ + "\1\65\1\0\1\26\2\0\1\25\13\0\1\66\15\0"+ + "\1\67\14\0\1\70\16\0\1\71\2\0\1\72\21\0"+ + "\1\73\20\0\1\27\1\0\1\27\3\0\1\54\1\0"+ + "\1\27\4\0\1\35\1\0\1\36\1\0\1\37\1\0"+ + "\1\40\1\0\1\41\1\0\1\74\3\0\1\56\5\0"+ + "\1\57\3\0\1\75\11\0\1\46\2\0\1\76\16\0"+ + "\1\77\2\0\1\100\21\0\1\101\17\0\1\25\1\102"+ + "\1\26\1\103\3\0\1\102\1\0\1\102\2\0\1\25"+ + "\142\0\2\31\4\0\1\35\1\0\1\36\1\0\1\37"+ + "\1\0\1\40\1\0\1\41\1\0\1\104\3\0\1\43"+ + "\5\0\1\44\3\0\1\105\11\0\1\46\2\0\1\106"+ + "\16\0\1\107\2\0\1\110\41\0\1\25\1\34\1\52"+ + "\1\0\1\53\1\0\1\53\1\54\1\0\1\34\2\0"+ + "\1\34\2\0\1\25\11\0\3\25\5\0\1\25\1\0"+ + "\1\25\1\0\1\25\4\0\1\25\4\0\1\25\1\0"+ + "\2\25\4\0\1\25\5\0\1\25\3\0\1\25\4\0"+ + "\5\25\10\0\1\52\1\0\2\25\1\0\1\25\10\0"+ + "\1\25\24\0\1\25\1\0\1\52\7\0\2\25\2\0"+ + "\5\25\2\0\2\25\4\0\6\25\1\0\2\25\4\0"+ + "\5\25\1\0\5\25\1\0\2\25\1\0\3\25\1\0"+ + "\4\25\1\0\5\25\1\52\1\0\1\25\1\0\1\25"+ + "\1\0\3\25\2\0\1\25\1\0\1\25\1\0\1\25"+ + "\2\0\1\25\17\0\1\25\3\0\1\25\5\0\2\25"+ + "\3\0\1\25\4\0\3\25\4\0\1\25\1\0\1\25"+ + "\2\0\1\25\1\0\2\25\4\0\1\25\1\0\1\25"+ + "\3\0\2\25\1\0\1\25\5\0\3\25\1\0\1\25"+ + "\10\0\1\25\1\0\2\52\1\0\1\25\10\0\1\25"+ + "\24\0\1\25\3\0\1\25\6\0\2\25\5\0\1\25"+ + "\1\0\1\25\1\0\1\25\1\0\11\25\2\0\1\25"+ + "\4\0\1\25\4\0\6\25\2\0\1\25\1\0\1\25"+ + "\1\0\3\25\1\0\1\25\1\0\2\25\4\0\3\25"+ + "\1\0\1\25\10\0\1\25\1\0\2\25\21\0\1\25"+ + "\3\0\1\25\5\0\1\25\32\0\15\25\5\0\3\25"+ + "\1\0\1\25\5\0\3\25\5\0\1\25\2\0\2\25"+ + "\4\0\1\25\2\0\1\25\1\0\1\25\103\0\2\25"+ + "\6\0\1\25\56\0\1\25\3\0\1\25\2\0\1\25"+ + "\3\0\1\25\5\0\1\25\7\0\1\25\4\0\2\25"+ + "\3\0\2\25\1\0\1\25\4\0\1\25\1\0\1\25"+ + "\2\0\2\25\1\0\3\25\1\0\1\25\2\0\4\25"+ + "\2\0\1\25\41\0\1\35\1\0\1\36\1\0\1\37"+ + "\1\0\1\40\1\0\1\41\1\0\1\111\3\0\1\43"+ + "\5\0\1\44\3\0\1\112\11\0\1\46\2\0\1\113"+ + "\16\0\1\114\2\0\1\115\41\0\1\25\2\52\2\0"+ + "\2\116\1\54\1\0\1\52\2\0\1\25\1\0\1\35"+ + "\1\0\1\36\1\0\1\37\1\0\1\40\1\0\1\41"+ + "\1\0\1\117\3\0\1\120\5\0\1\121\3\0\1\122"+ + "\11\0\1\46\2\0\1\123\16\0\1\124\2\0\1\125"+ + "\41\0\1\25\1\53\7\0\1\53\2\0\1\25\1\0"+ + "\1\35\1\0\1\36\1\0\1\37\1\0\1\40\1\0"+ + "\1\41\1\0\1\126\3\0\1\43\5\0\1\44\3\0"+ + "\1\127\11\0\1\46\2\0\1\130\16\0\1\131\2\0"+ + "\1\132\21\0\1\101\17\0\1\25\1\54\1\52\1\103"+ + "\3\0\1\54\1\0\1\54\2\0\1\25\2\0\1\26"+ + "\11\0\3\25\5\0\1\25\1\0\1\25\1\0\1\25"+ + "\4\0\1\25\4\0\1\26\1\0\2\26\4\0\1\25"+ + "\5\0\1\25\3\0\1\26\4\0\1\26\2\25\2\26"+ + "\10\0\1\26\1\0\2\25\1\0\1\26\10\0\1\25"+ + "\24\0\1\25\3\0\1\25\6\0\2\25\5\0\1\25"+ + "\1\0\1\25\1\0\1\25\1\0\11\25\2\0\1\25"+ + "\4\0\1\25\4\0\6\25\2\0\1\25\1\0\1\25"+ + "\1\0\3\25\1\0\1\26\1\0\2\25\4\0\3\25"+ + "\1\0\1\25\10\0\1\25\1\0\2\25\21\0\1\25"+ + "\3\0\1\25\5\0\1\25\32\0\15\25\5\0\3\25"+ + "\1\0\1\25\5\0\1\25\2\26\5\0\1\25\2\0"+ + "\1\25\1\26\4\0\1\25\2\0\1\25\1\0\1\25"+ + "\103\0\2\26\6\0\1\26\56\0\1\26\3\0\1\26"+ + "\2\0\1\26\3\0\1\26\5\0\1\26\7\0\1\26"+ + "\4\0\2\26\3\0\2\26\1\0\1\26\4\0\1\26"+ + "\1\0\1\26\2\0\2\26\1\0\3\26\1\0\1\26"+ + "\2\0\4\26\2\0\1\26\53\0\1\133\3\0\1\134"+ + "\5\0\1\135\3\0\1\136\14\0\1\137\16\0\1\140"+ + "\2\0\1\141\42\0\1\64\1\26\6\0\1\64\4\0"+ + "\1\35\1\0\1\36\1\0\1\37\1\0\1\40\1\0"+ + "\1\41\1\0\1\142\3\0\1\56\5\0\1\57\3\0"+ + "\1\143\11\0\1\46\2\0\1\144\16\0\1\145\2\0"+ + "\1\146\21\0\1\101\17\0\1\25\1\65\1\26\1\103"+ + "\3\0\1\65\1\0\1\65\2\0\1\25\2\0\1\27"+ + "\37\0\1\27\1\0\2\27\16\0\1\27\4\0\1\27"+ + "\2\0\2\27\15\0\1\27\132\0\1\27\153\0\2\27"+ + "\11\0\1\27\115\0\2\27\6\0\1\27\56\0\1\27"+ + "\3\0\1\27\2\0\1\27\3\0\1\27\5\0\1\27"+ + "\7\0\1\27\4\0\2\27\3\0\2\27\1\0\1\27"+ + "\4\0\1\27\1\0\1\27\2\0\2\27\1\0\3\27"+ + "\1\0\1\27\2\0\4\27\2\0\1\27\153\0\1\27"+ + "\35\0\1\102\11\0\3\25\5\0\1\25\1\0\1\25"+ + "\1\0\1\25\4\0\1\25\4\0\1\102\1\0\2\102"+ + "\4\0\1\25\5\0\1\25\3\0\1\102\4\0\1\102"+ + "\2\25\2\102\10\0\1\26\1\0\2\25\1\0\1\102"+ + "\10\0\1\25\24\0\1\25\3\0\1\25\6\0\2\25"+ + "\5\0\1\25\1\0\1\25\1\0\1\25\1\0\11\25"+ + "\2\0\1\25\4\0\1\25\4\0\6\25\2\0\1\25"+ + "\1\0\1\25\1\0\3\25\1\0\1\102\1\0\2\25"+ + "\4\0\3\25\1\0\1\25\10\0\1\25\1\0\2\25"+ + "\21\0\1\25\3\0\1\25\5\0\1\25\32\0\15\25"+ + "\5\0\3\25\1\0\1\25\5\0\1\25\2\102\5\0"+ + "\1\25\2\0\1\25\1\102\4\0\1\25\2\0\1\25"+ + "\1\0\1\25\103\0\2\102\6\0\1\102\56\0\1\102"+ + "\3\0\1\102\2\0\1\102\3\0\1\102\5\0\1\102"+ + "\7\0\1\102\4\0\2\102\3\0\2\102\1\0\1\102"+ + "\4\0\1\102\1\0\1\102\2\0\2\102\1\0\3\102"+ + "\1\0\1\102\2\0\4\102\2\0\1\102\153\0\1\103"+ + "\46\0\1\147\15\0\1\150\14\0\1\151\16\0\1\152"+ + "\2\0\1\153\21\0\1\101\20\0\1\103\1\0\1\103"+ + "\3\0\1\54\1\0\1\103\5\0\1\34\11\0\3\25"+ + "\5\0\1\25\1\0\1\25\1\0\1\25\4\0\1\25"+ + "\4\0\1\34\1\0\2\34\4\0\1\25\5\0\1\25"+ + "\3\0\1\34\4\0\1\34\2\25\2\34\10\0\1\52"+ + "\1\0\2\25\1\0\1\34\10\0\1\25\24\0\1\25"+ + "\3\0\1\25\6\0\2\25\5\0\1\25\1\0\1\25"+ + "\1\0\1\25\1\0\11\25\2\0\1\25\4\0\1\25"+ + "\4\0\6\25\2\0\1\25\1\0\1\25\1\0\3\25"+ + "\1\0\1\34\1\0\2\25\4\0\3\25\1\0\1\25"+ + "\10\0\1\25\1\0\2\25\21\0\1\25\3\0\1\25"+ + "\5\0\1\25\32\0\15\25\5\0\3\25\1\0\1\25"+ + "\5\0\1\25\2\34\5\0\1\25\2\0\1\25\1\34"+ + "\4\0\1\25\2\0\1\25\1\0\1\25\103\0\2\34"+ + "\6\0\1\34\56\0\1\34\3\0\1\34\2\0\1\34"+ + "\3\0\1\34\5\0\1\34\7\0\1\34\4\0\2\34"+ + "\3\0\2\34\1\0\1\34\4\0\1\34\1\0\1\34"+ + "\2\0\2\34\1\0\3\34\1\0\1\34\2\0\4\34"+ + "\2\0\1\34\42\0\1\52\11\0\3\25\5\0\1\25"+ + "\1\0\1\25\1\0\1\25\4\0\1\25\4\0\1\52"+ + "\1\0\2\52\4\0\1\25\5\0\1\25\3\0\1\52"+ + "\4\0\1\52\2\25\2\52\10\0\1\52\1\0\2\25"+ + "\1\0\1\52\10\0\1\25\24\0\1\25\3\0\1\25"+ + "\6\0\2\25\5\0\1\25\1\0\1\25\1\0\1\25"+ + "\1\0\11\25\2\0\1\25\4\0\1\25\4\0\6\25"+ + "\2\0\1\25\1\0\1\25\1\0\3\25\1\0\1\52"+ + "\1\0\2\25\4\0\3\25\1\0\1\25\10\0\1\25"+ + "\1\0\2\25\21\0\1\25\3\0\1\25\5\0\1\25"+ + "\32\0\15\25\5\0\3\25\1\0\1\25\5\0\1\25"+ + "\2\52\5\0\1\25\2\0\1\25\1\52\4\0\1\25"+ + "\2\0\1\25\1\0\1\25\103\0\2\52\6\0\1\52"+ + "\56\0\1\52\3\0\1\52\2\0\1\52\3\0\1\52"+ + "\5\0\1\52\7\0\1\52\4\0\2\52\3\0\2\52"+ + "\1\0\1\52\4\0\1\52\1\0\1\52\2\0\2\52"+ + "\1\0\3\52\1\0\1\52\2\0\4\52\2\0\1\52"+ + "\53\0\1\154\3\0\1\155\5\0\1\156\3\0\1\157"+ + "\14\0\1\160\16\0\1\161\2\0\1\162\42\0\1\116"+ + "\1\52\6\0\1\116\5\0\1\53\11\0\3\25\5\0"+ + "\1\25\1\0\1\25\1\0\1\25\4\0\1\25\4\0"+ + "\1\53\1\0\2\53\4\0\1\25\5\0\1\25\3\0"+ + "\1\53\4\0\1\53\2\25\2\53\12\0\2\25\1\0"+ + "\1\53\10\0\1\25\24\0\1\25\11\0\2\25\2\0"+ + "\5\25\2\0\2\25\4\0\6\25\1\0\2\25\4\0"+ + "\5\25\1\0\5\25\1\0\2\25\1\0\3\25\1\0"+ + "\4\25\1\0\5\25\2\0\1\25\1\0\1\25\1\0"+ + "\3\25\2\0\1\25\1\0\1\25\1\0\1\25\2\0"+ + "\1\25\17\0\1\25\3\0\1\25\5\0\2\25\3\0"+ + "\1\25\4\0\3\25\4\0\1\25\1\0\1\25\2\0"+ + "\1\25\1\0\2\25\4\0\1\25\1\0\1\25\3\0"+ + "\2\25\1\0\1\25\5\0\3\25\1\0\1\25\10\0"+ + "\1\25\4\0\1\25\10\0\1\25\24\0\1\25\3\0"+ + "\1\25\6\0\2\25\5\0\1\25\1\0\1\25\1\0"+ + "\1\25\1\0\11\25\2\0\1\25\4\0\1\25\4\0"+ + "\6\25\2\0\1\25\1\0\1\25\1\0\3\25\1\0"+ + "\1\53\1\0\2\25\4\0\3\25\1\0\1\25\10\0"+ + "\1\25\1\0\2\25\21\0\1\25\3\0\1\25\5\0"+ + "\1\25\32\0\15\25\5\0\3\25\1\0\1\25\5\0"+ + "\1\25\2\53\5\0\1\25\2\0\1\25\1\53\4\0"+ + "\1\25\2\0\1\25\1\0\1\25\103\0\2\53\6\0"+ + "\1\53\56\0\1\53\3\0\1\53\2\0\1\53\3\0"+ + "\1\53\5\0\1\53\7\0\1\53\4\0\2\53\3\0"+ + "\2\53\1\0\1\53\4\0\1\53\1\0\1\53\2\0"+ + "\2\53\1\0\3\53\1\0\1\53\2\0\4\53\2\0"+ + "\1\53\42\0\1\54\11\0\3\25\5\0\1\25\1\0"+ + "\1\25\1\0\1\25\4\0\1\25\4\0\1\54\1\0"+ + "\2\54\4\0\1\25\5\0\1\25\3\0\1\54\4\0"+ + "\1\54\2\25\2\54\10\0\1\52\1\0\2\25\1\0"+ + "\1\54\10\0\1\25\24\0\1\25\3\0\1\25\6\0"+ + "\2\25\5\0\1\25\1\0\1\25\1\0\1\25\1\0"+ + "\11\25\2\0\1\25\4\0\1\25\4\0\6\25\2\0"+ + "\1\25\1\0\1\25\1\0\3\25\1\0\1\54\1\0"+ + "\2\25\4\0\3\25\1\0\1\25\10\0\1\25\1\0"+ + "\2\25\21\0\1\25\3\0\1\25\5\0\1\25\32\0"+ + "\15\25\5\0\3\25\1\0\1\25\5\0\1\25\2\54"+ + "\5\0\1\25\2\0\1\25\1\54\4\0\1\25\2\0"+ + "\1\25\1\0\1\25\103\0\2\54\6\0\1\54\56\0"+ + "\1\54\3\0\1\54\2\0\1\54\3\0\1\54\5\0"+ + "\1\54\7\0\1\54\4\0\2\54\3\0\2\54\1\0"+ + "\1\54\4\0\1\54\1\0\1\54\2\0\2\54\1\0"+ + "\3\54\1\0\1\54\2\0\4\54\2\0\1\54\42\0"+ + "\1\64\37\0\1\64\1\0\2\64\16\0\1\64\4\0"+ + "\1\64\2\0\2\64\10\0\1\26\4\0\1\64\37\0"+ + "\1\26\102\0\1\26\147\0\2\26\134\0\1\64\153\0"+ + "\2\64\11\0\1\64\115\0\2\64\6\0\1\64\56\0"+ + "\1\64\3\0\1\64\2\0\1\64\3\0\1\64\5\0"+ + "\1\64\7\0\1\64\4\0\2\64\3\0\2\64\1\0"+ + "\1\64\4\0\1\64\1\0\1\64\2\0\2\64\1\0"+ + "\3\64\1\0\1\64\2\0\4\64\2\0\1\64\42\0"+ + "\1\65\11\0\3\25\5\0\1\25\1\0\1\25\1\0"+ + "\1\25\4\0\1\25\4\0\1\65\1\0\2\65\4\0"+ + "\1\25\5\0\1\25\3\0\1\65\4\0\1\65\2\25"+ + "\2\65\10\0\1\26\1\0\2\25\1\0\1\65\10\0"+ + "\1\25\24\0\1\25\3\0\1\25\6\0\2\25\5\0"+ + "\1\25\1\0\1\25\1\0\1\25\1\0\11\25\2\0"+ + "\1\25\4\0\1\25\4\0\6\25\2\0\1\25\1\0"+ + "\1\25\1\0\3\25\1\0\1\65\1\0\2\25\4\0"+ + "\3\25\1\0\1\25\10\0\1\25\1\0\2\25\21\0"+ + "\1\25\3\0\1\25\5\0\1\25\32\0\15\25\5\0"+ + "\3\25\1\0\1\25\5\0\1\25\2\65\5\0\1\25"+ + "\2\0\1\25\1\65\4\0\1\25\2\0\1\25\1\0"+ + "\1\25\103\0\2\65\6\0\1\65\56\0\1\65\3\0"+ + "\1\65\2\0\1\65\3\0\1\65\5\0\1\65\7\0"+ + "\1\65\4\0\2\65\3\0\2\65\1\0\1\65\4\0"+ + "\1\65\1\0\1\65\2\0\2\65\1\0\3\65\1\0"+ + "\1\65\2\0\4\65\2\0\1\65\42\0\1\103\37\0"+ + "\1\103\1\0\2\103\16\0\1\103\4\0\1\103\2\0"+ + "\2\103\15\0\1\103\132\0\1\103\153\0\2\103\11\0"+ + "\1\103\115\0\2\103\6\0\1\103\56\0\1\103\3\0"+ + "\1\103\2\0\1\103\3\0\1\103\5\0\1\103\7\0"+ + "\1\103\4\0\2\103\3\0\2\103\1\0\1\103\4\0"+ + "\1\103\1\0\1\103\2\0\2\103\1\0\3\103\1\0"+ + "\1\103\2\0\4\103\2\0\1\103\42\0\1\116\37\0"+ + "\1\116\1\0\2\116\16\0\1\116\4\0\1\116\2\0"+ + "\2\116\10\0\1\52\4\0\1\116\37\0\1\52\102\0"+ + "\1\52\147\0\2\52\134\0\1\116\153\0\2\116\11\0"+ + "\1\116\115\0\2\116\6\0\1\116\56\0\1\116\3\0"+ + "\1\116\2\0\1\116\3\0\1\116\5\0\1\116\7\0"+ + "\1\116\4\0\2\116\3\0\2\116\1\0\1\116\4\0"+ + "\1\116\1\0\1\116\2\0\2\116\1\0\3\116\1\0"+ + "\1\116\2\0\4\116\2\0\1\116\40\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[10609]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\1\0\1\11\27\1\2\11\1\1\15\0\1\1\1\0"+ + "\1\1\10\0\1\1\15\0\1\1\57\0"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[114]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + /** Alphanumeric sequences */ + public static final int WORD_TYPE = StandardTokenizer.ALPHANUM; + + /** Numbers */ + public static final int NUMERIC_TYPE = StandardTokenizer.NUM; + + /** + * Chars in class \p{Line_Break = Complex_Context} are from South East Asian + * scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept + * together as as a single token rather than broken up, because the logic + * required to break them at word boundaries is too complex for UAX#29. + *

      + * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA + */ + public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN; + + public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC; + + public static final int HIRAGANA_TYPE = StandardTokenizer.HIRAGANA; + + public static final int KATAKANA_TYPE = StandardTokenizer.KATAKANA; + + public static final int HANGUL_TYPE = StandardTokenizer.HANGUL; + + public final int yychar() + { + return yychar; + } + + /** + * Fills CharTermAttribute with the current token text. + */ + public final void getText(CharTermAttribute t) { + t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); + } + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public StandardTokenizerImpl31(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public StandardTokenizerImpl31(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 2650) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * Internal scan buffer is resized down to its initial length, if it has grown. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + if (zzBuffer.length > ZZ_BUFFERSIZE) + zzBuffer = new char[ZZ_BUFFERSIZE]; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. + * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public int getNextToken() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + yychar+= zzMarkedPosL-zzStartRead; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + // set up zzAction for empty match case: + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + } + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { + case 2: + { return WORD_TYPE; + } + case 9: break; + case 5: + { return SOUTH_EAST_ASIAN_TYPE; + } + case 10: break; + case 4: + { return KATAKANA_TYPE; + } + case 11: break; + case 6: + { return IDEOGRAPHIC_TYPE; + } + case 12: break; + case 1: + { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ + } + case 13: break; + case 8: + { return HANGUL_TYPE; + } + case 14: break; + case 3: + { return NUMERIC_TYPE; + } + case 15: break; + case 7: + { return HIRAGANA_TYPE; + } + case 16: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + { + return StandardTokenizerInterface.YYEOF; + } + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex new file mode 100644 index 00000000000..b8272fd4f52 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex @@ -0,0 +1,184 @@ +package org.apache.lucene.analysis.standard.std31; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.standard.StandardTokenizerInterface; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +/** + * This class implements StandardTokenizer, except with a bug + * (https://issues.apache.org/jira/browse/LUCENE-3358) where Han and Hiragana + * characters would be split from combining characters: + * @deprecated This class is only for exact backwards compatibility + */ +@Deprecated +%% + +%unicode 6.0 +%integer +%final +%public +%class StandardTokenizerImpl31 +%implements StandardTokenizerInterface +%function getNextToken +%char + +%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro +ALetter = ([\p{WB:ALetter}] | {ALetterSupp}) +Format = ([\p{WB:Format}] | {FormatSupp}) +Numeric = ([\p{WB:Numeric}] | {NumericSupp}) +Extend = ([\p{WB:Extend}] | {ExtendSupp}) +Katakana = ([\p{WB:Katakana}] | {KatakanaSupp}) +MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp}) +MidNum = ([\p{WB:MidNum}] | {MidNumSupp}) +MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp}) +ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp}) +ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp}) +Han = ([\p{Script:Han}] | {HanSupp}) +Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp}) + +// Script=Hangul & Aletter +HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})* +// UAX#29 WB4. X (Extend | Format)* --> X +// +ALetterEx = {ALetter} ({Format} | {Extend})* +// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it +NumericEx = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})* +KatakanaEx = {Katakana} ({Format} | {Extend})* +MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})* +MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})* +ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})* + + +%{ + /** Alphanumeric sequences */ + public static final int WORD_TYPE = StandardTokenizer.ALPHANUM; + + /** Numbers */ + public static final int NUMERIC_TYPE = StandardTokenizer.NUM; + + /** + * Chars in class \p{Line_Break = Complex_Context} are from South East Asian + * scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept + * together as as a single token rather than broken up, because the logic + * required to break them at word boundaries is too complex for UAX#29. + *

      + * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA + */ + public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN; + + public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC; + + public static final int HIRAGANA_TYPE = StandardTokenizer.HIRAGANA; + + public static final int KATAKANA_TYPE = StandardTokenizer.KATAKANA; + + public static final int HANGUL_TYPE = StandardTokenizer.HANGUL; + + public final int yychar() + { + return yychar; + } + + /** + * Fills CharTermAttribute with the current token text. + */ + public final void getText(CharTermAttribute t) { + t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); + } +%} + +%% + +// UAX#29 WB1. sot ÷ +// WB2. ÷ eot +// +<> { return StandardTokenizerInterface.YYEOF; } + +// UAX#29 WB8. Numeric × Numeric +// WB11. Numeric (MidNum | MidNumLet) × Numeric +// WB12. Numeric × (MidNum | MidNumLet) Numeric +// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet +// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana) +// +{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx} + | {MidNumericEx} {NumericEx} + | {NumericEx})* +{ExtendNumLetEx}* + { return NUMERIC_TYPE; } + +// subset of the below for typing purposes only! +{HangulEx}+ + { return HANGUL_TYPE; } + +{KatakanaEx}+ + { return KATAKANA_TYPE; } + +// UAX#29 WB5. ALetter × ALetter +// WB6. ALetter × (MidLetter | MidNumLet) ALetter +// WB7. ALetter (MidLetter | MidNumLet) × ALetter +// WB9. ALetter × Numeric +// WB10. Numeric × ALetter +// WB13. Katakana × Katakana +// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet +// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana) +// +{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})* + | ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})* + | {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) +({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})* + | ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})* + | {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )* +{ExtendNumLetEx}* + { return WORD_TYPE; } + + +// From UAX #29: +// +// [C]haracters with the Line_Break property values of Contingent_Break (CB), +// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word +// boundary property values based on criteria outside of the scope of this +// annex. That means that satisfactory treatment of languages like Chinese +// or Thai requires special handling. +// +// In Unicode 6.0, only one character has the \p{Line_Break = Contingent_Break} +// property: U+FFFC (  ) OBJECT REPLACEMENT CHARACTER. +// +// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context} +// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer, +// Lao, etc.) are kept together. This grammar does the same below. +// +// See also the Unicode Line Breaking Algorithm: +// +// http://www.unicode.org/reports/tr14/#SA +// +{ComplexContext}+ { return SOUTH_EAST_ASIAN_TYPE; } + +// UAX#29 WB14. Any ÷ Any +// +{Han} { return IDEOGRAPHIC_TYPE; } +{Hiragana} { return HIRAGANA_TYPE; } + + +// UAX#29 WB3. CR × LF +// WB3a. (Newline | CR | LF) ÷ +// WB3b. ÷ (Newline | CR | LF) +// WB14. Any ÷ Any +// +[^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java index 17bae9e24e5..1c35093fc7c 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java @@ -6,6 +6,7 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.ReusableAnalyzerBase; +import org.apache.lucene.util.Version; import java.io.IOException; import java.io.Reader; @@ -221,6 +222,23 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase { new String[] { "", "", "", "", "" }); } + public void testCombiningMarks() throws Exception { + checkOneTerm(a, "ざ", "ざ"); // hiragana + checkOneTerm(a, "ザ", "ザ"); // katakana + checkOneTerm(a, "壹゙", "壹゙"); // ideographic + checkOneTerm(a, "아゙", "아゙"); // hangul + } + + /** @deprecated remove this and sophisticated backwards layer in 5.0 */ + @Deprecated + public void testCombiningMarksBackwards() throws Exception { + Analyzer a = new StandardAnalyzer(Version.LUCENE_33); + checkOneTerm(a, "ざ", "さ"); // hiragana Bug + checkOneTerm(a, "ザ", "ザ"); // katakana Works + checkOneTerm(a, "壹゙", "壹"); // ideographic Bug + checkOneTerm(a, "아゙", "아゙"); // hangul Works + } + /** blast some random strings through the analyzer */ public void testRandomStrings() throws Exception { checkRandomData(random, new StandardAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);