LUCENE-3358: StandardTokenizer wrongly discarded combining marks attached to Han/Hiragana

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1154005 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-08-04 20:49:47 +00:00
parent ca05564005
commit 2dda5bc35f
10 changed files with 1566 additions and 233 deletions

View File

@ -535,6 +535,11 @@ Bug fixes
suppressed exceptions in the original exception, so stack trace suppressed exceptions in the original exception, so stack trace
will contain them. (Uwe Schindler) will contain them. (Uwe Schindler)
* LUCENE-3358: StandardTokenizer wrongly discarded combining marks attached
to Han or Hiragana characters. This is fixed if you supply Version >= 3.4.
If you supply a previous Lucene version, you get the old buggy behavior
for backwards compatibility. (Trejkaz, Robert Muir)
New Features New Features
* LUCENE-3290: Added FieldInvertState.numUniqueTerms * LUCENE-3290: Added FieldInvertState.numUniqueTerms

View File

@ -67,6 +67,9 @@
<jflex file="src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex" <jflex file="src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex"
outdir="src/java/org/apache/lucene/analysis/standard" outdir="src/java/org/apache/lucene/analysis/standard"
nobak="on" /> nobak="on" />
<jflex file="src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex"
outdir="src/java/org/apache/lucene/analysis/standard/std31"
nobak="on" />
</target> </target>
<target name="jflex-UAX29URLEmailTokenizer" depends="jflex-check" if="jflex.present"> <target name="jflex-UAX29URLEmailTokenizer" depends="jflex-check" if="jflex.present">

View File

@ -39,6 +39,9 @@ import java.util.Set;
* <p>You must specify the required {@link Version} * <p>You must specify the required {@link Version}
* compatibility when creating StandardAnalyzer: * compatibility when creating StandardAnalyzer:
* <ul> * <ul>
* <li> As of 3.4, Hiragana and Han characters are no longer wrongly split
* from their combining characters. If you use a previous version number,
* you get the exact broken behavior for backwards compatibility.
* <li> As of 3.1, StandardTokenizer implements Unicode text segmentation, * <li> As of 3.1, StandardTokenizer implements Unicode text segmentation,
* and StopFilter correctly handles Unicode 4.0 supplementary characters * and StopFilter correctly handles Unicode 4.0 supplementary characters
* in stopwords. {@link ClassicTokenizer} and {@link ClassicAnalyzer} * in stopwords. {@link ClassicTokenizer} and {@link ClassicAnalyzer}

View File

@ -21,6 +21,7 @@ import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.std31.StandardTokenizerImpl31;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@ -42,6 +43,9 @@ import org.apache.lucene.util.Version;
* <p>You must specify the required {@link Version} * <p>You must specify the required {@link Version}
* compatibility when creating StandardTokenizer: * compatibility when creating StandardTokenizer:
* <ul> * <ul>
* <li> As of 3.4, Hiragana and Han characters are no longer wrongly split
* from their combining characters. If you use a previous version number,
* you get the exact broken behavior for backwards compatibility.
* <li> As of 3.1, StandardTokenizer implements Unicode text segmentation. * <li> As of 3.1, StandardTokenizer implements Unicode text segmentation.
* If you use a previous version number, you get the exact behavior of * If you use a previous version number, you get the exact behavior of
* {@link ClassicTokenizer} for backwards compatibility. * {@link ClassicTokenizer} for backwards compatibility.
@ -142,8 +146,13 @@ public final class StandardTokenizer extends Tokenizer {
} }
private final void init(Reader input, Version matchVersion) { private final void init(Reader input, Version matchVersion) {
this.scanner = matchVersion.onOrAfter(Version.LUCENE_31) ? if (matchVersion.onOrAfter(Version.LUCENE_34)) {
new StandardTokenizerImpl(input) : new ClassicTokenizerImpl(input); this.scanner = new StandardTokenizerImpl(input);
} else if (matchVersion.onOrAfter(Version.LUCENE_31)) {
this.scanner = new StandardTokenizerImpl31(input);
} else {
this.scanner = new ClassicTokenizerImpl(input);
}
this.input = input; this.input = input;
} }

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 2/9/11 11:45 AM */ /* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/4/11 4:07 PM */
package org.apache.lucene.analysis.standard; package org.apache.lucene.analysis.standard;
@ -209,10 +209,10 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface {
private static final String ZZ_ACTION_PACKED_0 = private static final String ZZ_ACTION_PACKED_0 =
"\1\0\23\1\1\2\1\3\1\4\1\1\1\5\1\6"+ "\1\0\23\1\1\2\1\3\1\4\1\1\1\5\1\6"+
"\1\7\1\10\15\0\1\2\1\0\1\2\10\0\1\3"+ "\1\7\1\10\15\0\1\2\1\0\1\2\10\0\1\3"+
"\15\0\1\2\57\0"; "\15\0\1\2\71\0";
private static int [] zzUnpackAction() { private static int [] zzUnpackAction() {
int [] result = new int[114]; int [] result = new int[124];
int offset = 0; int offset = 0;
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
return result; return result;
@ -240,21 +240,22 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface {
"\0\0\0\147\0\316\0\u0135\0\u019c\0\u0203\0\u026a\0\u02d1"+ "\0\0\0\147\0\316\0\u0135\0\u019c\0\u0203\0\u026a\0\u02d1"+
"\0\u0338\0\u039f\0\u0406\0\u046d\0\u04d4\0\u053b\0\u05a2\0\u0609"+ "\0\u0338\0\u039f\0\u0406\0\u046d\0\u04d4\0\u053b\0\u05a2\0\u0609"+
"\0\u0670\0\u06d7\0\u073e\0\u07a5\0\u080c\0\u0873\0\u08da\0\u0941"+ "\0\u0670\0\u06d7\0\u073e\0\u07a5\0\u080c\0\u0873\0\u08da\0\u0941"+
"\0\u09a8\0\147\0\147\0\u0a0f\0\316\0\u0135\0\u019c\0\u0203"+ "\0\u09a8\0\u0a0f\0\u0a76\0\u0add\0\316\0\u0135\0\u019c\0\u0203"+
"\0\u026a\0\u0a76\0\u0add\0\u0b44\0\u0bab\0\u046d\0\u0c12\0\u0c79"+ "\0\u026a\0\u0b44\0\u0bab\0\u0c12\0\u0c79\0\u046d\0\u0ce0\0\u0d47"+
"\0\u0ce0\0\u0d47\0\u0dae\0\u0e15\0\u0e7c\0\u0338\0\u039f\0\u0ee3"+ "\0\u0dae\0\u0e15\0\u0e7c\0\u0ee3\0\u0f4a\0\u0338\0\u039f\0\u0fb1"+
"\0\u0f4a\0\u0fb1\0\u1018\0\u107f\0\u10e6\0\u114d\0\u11b4\0\u121b"+ "\0\u1018\0\u107f\0\u10e6\0\u114d\0\u11b4\0\u121b\0\u1282\0\u12e9"+
"\0\u1282\0\u12e9\0\u1350\0\u13b7\0\u141e\0\u1485\0\u14ec\0\u1553"+ "\0\u1350\0\u13b7\0\u141e\0\u1485\0\u14ec\0\u1553\0\u15ba\0\u1621"+
"\0\u15ba\0\u0941\0\u1621\0\u1688\0\u16ef\0\u1756\0\u17bd\0\u1824"+ "\0\u1688\0\u0941\0\u16ef\0\u1756\0\u17bd\0\u1824\0\u188b\0\u18f2"+
"\0\u188b\0\u18f2\0\u1959\0\u19c0\0\u1a27\0\u1a8e\0\u1af5\0\u1b5c"+ "\0\u1959\0\u19c0\0\u1a27\0\u1a8e\0\u1af5\0\u1b5c\0\u1bc3\0\u1c2a"+
"\0\u1bc3\0\u1c2a\0\u1c91\0\u1cf8\0\u1d5f\0\u1dc6\0\u1e2d\0\u1e94"+ "\0\u1c91\0\u1cf8\0\u1d5f\0\u1dc6\0\u1e2d\0\u1e94\0\u1efb\0\u1f62"+
"\0\u1efb\0\u1f62\0\u1fc9\0\u2030\0\u2097\0\u20fe\0\u2165\0\u21cc"+ "\0\u1fc9\0\u2030\0\u2097\0\u20fe\0\u2165\0\u21cc\0\u2233\0\u229a"+
"\0\u2233\0\u229a\0\u2301\0\u2368\0\u23cf\0\u2436\0\u249d\0\u2504"+ "\0\u2301\0\u2368\0\u23cf\0\u2436\0\u249d\0\u2504\0\u256b\0\u25d2"+
"\0\u256b\0\u25d2\0\u2639\0\u26a0\0\u2707\0\u276e\0\u27d5\0\u283c"+ "\0\u2639\0\u26a0\0\u2707\0\u276e\0\u27d5\0\u283c\0\u28a3\0\u290a"+
"\0\u28a3\0\u290a"; "\0\u2971\0\u29d8\0\u2a3f\0\u2aa6\0\u2b0d\0\u2b74\0\u2bdb\0\u2c42"+
"\0\u2ca9\0\u2d10\0\u2d77\0\u2dde";
private static int [] zzUnpackRowMap() { private static int [] zzUnpackRowMap() {
int [] result = new int[114]; int [] result = new int[124];
int offset = 0; int offset = 0;
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
return result; return result;
@ -367,223 +368,241 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface {
"\1\57\3\0\1\75\11\0\1\46\2\0\1\76\16\0"+ "\1\57\3\0\1\75\11\0\1\46\2\0\1\76\16\0"+
"\1\77\2\0\1\100\21\0\1\101\17\0\1\25\1\102"+ "\1\77\2\0\1\100\21\0\1\101\17\0\1\25\1\102"+
"\1\26\1\103\3\0\1\102\1\0\1\102\2\0\1\25"+ "\1\26\1\103\3\0\1\102\1\0\1\102\2\0\1\25"+
"\142\0\2\31\4\0\1\35\1\0\1\36\1\0\1\37"+ "\142\0\2\31\16\0\1\104\15\0\1\105\14\0\1\106"+
"\1\0\1\40\1\0\1\41\1\0\1\104\3\0\1\43"+ "\16\0\1\107\2\0\1\110\42\0\1\32\7\0\1\32"+
"\5\0\1\44\3\0\1\105\11\0\1\46\2\0\1\106"+ "\16\0\1\111\15\0\1\112\14\0\1\113\16\0\1\114"+
"\16\0\1\107\2\0\1\110\41\0\1\25\1\34\1\52"+ "\2\0\1\115\42\0\1\33\7\0\1\33\4\0\1\35"+
"\1\0\1\53\1\0\1\53\1\54\1\0\1\34\2\0"+
"\1\34\2\0\1\25\11\0\3\25\5\0\1\25\1\0"+
"\1\25\1\0\1\25\4\0\1\25\4\0\1\25\1\0"+
"\2\25\4\0\1\25\5\0\1\25\3\0\1\25\4\0"+
"\5\25\10\0\1\52\1\0\2\25\1\0\1\25\10\0"+
"\1\25\24\0\1\25\1\0\1\52\7\0\2\25\2\0"+
"\5\25\2\0\2\25\4\0\6\25\1\0\2\25\4\0"+
"\5\25\1\0\5\25\1\0\2\25\1\0\3\25\1\0"+
"\4\25\1\0\5\25\1\52\1\0\1\25\1\0\1\25"+
"\1\0\3\25\2\0\1\25\1\0\1\25\1\0\1\25"+
"\2\0\1\25\17\0\1\25\3\0\1\25\5\0\2\25"+
"\3\0\1\25\4\0\3\25\4\0\1\25\1\0\1\25"+
"\2\0\1\25\1\0\2\25\4\0\1\25\1\0\1\25"+
"\3\0\2\25\1\0\1\25\5\0\3\25\1\0\1\25"+
"\10\0\1\25\1\0\2\52\1\0\1\25\10\0\1\25"+
"\24\0\1\25\3\0\1\25\6\0\2\25\5\0\1\25"+
"\1\0\1\25\1\0\1\25\1\0\11\25\2\0\1\25"+
"\4\0\1\25\4\0\6\25\2\0\1\25\1\0\1\25"+
"\1\0\3\25\1\0\1\25\1\0\2\25\4\0\3\25"+
"\1\0\1\25\10\0\1\25\1\0\2\25\21\0\1\25"+
"\3\0\1\25\5\0\1\25\32\0\15\25\5\0\3\25"+
"\1\0\1\25\5\0\3\25\5\0\1\25\2\0\2\25"+
"\4\0\1\25\2\0\1\25\1\0\1\25\103\0\2\25"+
"\6\0\1\25\56\0\1\25\3\0\1\25\2\0\1\25"+
"\3\0\1\25\5\0\1\25\7\0\1\25\4\0\2\25"+
"\3\0\2\25\1\0\1\25\4\0\1\25\1\0\1\25"+
"\2\0\2\25\1\0\3\25\1\0\1\25\2\0\4\25"+
"\2\0\1\25\41\0\1\35\1\0\1\36\1\0\1\37"+
"\1\0\1\40\1\0\1\41\1\0\1\111\3\0\1\43"+
"\5\0\1\44\3\0\1\112\11\0\1\46\2\0\1\113"+
"\16\0\1\114\2\0\1\115\41\0\1\25\2\52\2\0"+
"\2\116\1\54\1\0\1\52\2\0\1\25\1\0\1\35"+
"\1\0\1\36\1\0\1\37\1\0\1\40\1\0\1\41"+ "\1\0\1\36\1\0\1\37\1\0\1\40\1\0\1\41"+
"\1\0\1\117\3\0\1\120\5\0\1\121\3\0\1\122"+ "\1\0\1\116\3\0\1\43\5\0\1\44\3\0\1\117"+
"\11\0\1\46\2\0\1\123\16\0\1\124\2\0\1\125"+ "\11\0\1\46\2\0\1\120\16\0\1\121\2\0\1\122"+
"\41\0\1\25\1\53\7\0\1\53\2\0\1\25\1\0"+ "\41\0\1\25\1\34\1\52\1\0\1\53\1\0\1\53"+
"\1\35\1\0\1\36\1\0\1\37\1\0\1\40\1\0"+ "\1\54\1\0\1\34\2\0\1\34\2\0\1\25\11\0"+
"\1\41\1\0\1\126\3\0\1\43\5\0\1\44\3\0"+ "\3\25\5\0\1\25\1\0\1\25\1\0\1\25\4\0"+
"\1\127\11\0\1\46\2\0\1\130\16\0\1\131\2\0"+ "\1\25\4\0\1\25\1\0\2\25\4\0\1\25\5\0"+
"\1\132\21\0\1\101\17\0\1\25\1\54\1\52\1\103"+ "\1\25\3\0\1\25\4\0\5\25\10\0\1\52\1\0"+
"\3\0\1\54\1\0\1\54\2\0\1\25\2\0\1\26"+ "\2\25\1\0\1\25\10\0\1\25\24\0\1\25\1\0"+
"\11\0\3\25\5\0\1\25\1\0\1\25\1\0\1\25"+ "\1\52\7\0\2\25\2\0\5\25\2\0\2\25\4\0"+
"\4\0\1\25\4\0\1\26\1\0\2\26\4\0\1\25"+ "\6\25\1\0\2\25\4\0\5\25\1\0\5\25\1\0"+
"\5\0\1\25\3\0\1\26\4\0\1\26\2\25\2\26"+ "\2\25\1\0\3\25\1\0\4\25\1\0\5\25\1\52"+
"\10\0\1\26\1\0\2\25\1\0\1\26\10\0\1\25"+ "\1\0\1\25\1\0\1\25\1\0\3\25\2\0\1\25"+
"\24\0\1\25\3\0\1\25\6\0\2\25\5\0\1\25"+ "\1\0\1\25\1\0\1\25\2\0\1\25\17\0\1\25"+
"\1\0\1\25\1\0\1\25\1\0\11\25\2\0\1\25"+ "\3\0\1\25\5\0\2\25\3\0\1\25\4\0\3\25"+
"\4\0\1\25\4\0\6\25\2\0\1\25\1\0\1\25"+ "\4\0\1\25\1\0\1\25\2\0\1\25\1\0\2\25"+
"\1\0\3\25\1\0\1\26\1\0\2\25\4\0\3\25"+ "\4\0\1\25\1\0\1\25\3\0\2\25\1\0\1\25"+
"\1\0\1\25\10\0\1\25\1\0\2\25\21\0\1\25"+ "\5\0\3\25\1\0\1\25\10\0\1\25\1\0\2\52"+
"\3\0\1\25\5\0\1\25\32\0\15\25\5\0\3\25"+ "\1\0\1\25\10\0\1\25\24\0\1\25\3\0\1\25"+
"\1\0\1\25\5\0\1\25\2\26\5\0\1\25\2\0"+
"\1\25\1\26\4\0\1\25\2\0\1\25\1\0\1\25"+
"\103\0\2\26\6\0\1\26\56\0\1\26\3\0\1\26"+
"\2\0\1\26\3\0\1\26\5\0\1\26\7\0\1\26"+
"\4\0\2\26\3\0\2\26\1\0\1\26\4\0\1\26"+
"\1\0\1\26\2\0\2\26\1\0\3\26\1\0\1\26"+
"\2\0\4\26\2\0\1\26\53\0\1\133\3\0\1\134"+
"\5\0\1\135\3\0\1\136\14\0\1\137\16\0\1\140"+
"\2\0\1\141\42\0\1\64\1\26\6\0\1\64\4\0"+
"\1\35\1\0\1\36\1\0\1\37\1\0\1\40\1\0"+
"\1\41\1\0\1\142\3\0\1\56\5\0\1\57\3\0"+
"\1\143\11\0\1\46\2\0\1\144\16\0\1\145\2\0"+
"\1\146\21\0\1\101\17\0\1\25\1\65\1\26\1\103"+
"\3\0\1\65\1\0\1\65\2\0\1\25\2\0\1\27"+
"\37\0\1\27\1\0\2\27\16\0\1\27\4\0\1\27"+
"\2\0\2\27\15\0\1\27\132\0\1\27\153\0\2\27"+
"\11\0\1\27\115\0\2\27\6\0\1\27\56\0\1\27"+
"\3\0\1\27\2\0\1\27\3\0\1\27\5\0\1\27"+
"\7\0\1\27\4\0\2\27\3\0\2\27\1\0\1\27"+
"\4\0\1\27\1\0\1\27\2\0\2\27\1\0\3\27"+
"\1\0\1\27\2\0\4\27\2\0\1\27\153\0\1\27"+
"\35\0\1\102\11\0\3\25\5\0\1\25\1\0\1\25"+
"\1\0\1\25\4\0\1\25\4\0\1\102\1\0\2\102"+
"\4\0\1\25\5\0\1\25\3\0\1\102\4\0\1\102"+
"\2\25\2\102\10\0\1\26\1\0\2\25\1\0\1\102"+
"\10\0\1\25\24\0\1\25\3\0\1\25\6\0\2\25"+
"\5\0\1\25\1\0\1\25\1\0\1\25\1\0\11\25"+
"\2\0\1\25\4\0\1\25\4\0\6\25\2\0\1\25"+
"\1\0\1\25\1\0\3\25\1\0\1\102\1\0\2\25"+
"\4\0\3\25\1\0\1\25\10\0\1\25\1\0\2\25"+
"\21\0\1\25\3\0\1\25\5\0\1\25\32\0\15\25"+
"\5\0\3\25\1\0\1\25\5\0\1\25\2\102\5\0"+
"\1\25\2\0\1\25\1\102\4\0\1\25\2\0\1\25"+
"\1\0\1\25\103\0\2\102\6\0\1\102\56\0\1\102"+
"\3\0\1\102\2\0\1\102\3\0\1\102\5\0\1\102"+
"\7\0\1\102\4\0\2\102\3\0\2\102\1\0\1\102"+
"\4\0\1\102\1\0\1\102\2\0\2\102\1\0\3\102"+
"\1\0\1\102\2\0\4\102\2\0\1\102\153\0\1\103"+
"\46\0\1\147\15\0\1\150\14\0\1\151\16\0\1\152"+
"\2\0\1\153\21\0\1\101\20\0\1\103\1\0\1\103"+
"\3\0\1\54\1\0\1\103\5\0\1\34\11\0\3\25"+
"\5\0\1\25\1\0\1\25\1\0\1\25\4\0\1\25"+
"\4\0\1\34\1\0\2\34\4\0\1\25\5\0\1\25"+
"\3\0\1\34\4\0\1\34\2\25\2\34\10\0\1\52"+
"\1\0\2\25\1\0\1\34\10\0\1\25\24\0\1\25"+
"\3\0\1\25\6\0\2\25\5\0\1\25\1\0\1\25"+
"\1\0\1\25\1\0\11\25\2\0\1\25\4\0\1\25"+
"\4\0\6\25\2\0\1\25\1\0\1\25\1\0\3\25"+
"\1\0\1\34\1\0\2\25\4\0\3\25\1\0\1\25"+
"\10\0\1\25\1\0\2\25\21\0\1\25\3\0\1\25"+
"\5\0\1\25\32\0\15\25\5\0\3\25\1\0\1\25"+
"\5\0\1\25\2\34\5\0\1\25\2\0\1\25\1\34"+
"\4\0\1\25\2\0\1\25\1\0\1\25\103\0\2\34"+
"\6\0\1\34\56\0\1\34\3\0\1\34\2\0\1\34"+
"\3\0\1\34\5\0\1\34\7\0\1\34\4\0\2\34"+
"\3\0\2\34\1\0\1\34\4\0\1\34\1\0\1\34"+
"\2\0\2\34\1\0\3\34\1\0\1\34\2\0\4\34"+
"\2\0\1\34\42\0\1\52\11\0\3\25\5\0\1\25"+
"\1\0\1\25\1\0\1\25\4\0\1\25\4\0\1\52"+
"\1\0\2\52\4\0\1\25\5\0\1\25\3\0\1\52"+
"\4\0\1\52\2\25\2\52\10\0\1\52\1\0\2\25"+
"\1\0\1\52\10\0\1\25\24\0\1\25\3\0\1\25"+
"\6\0\2\25\5\0\1\25\1\0\1\25\1\0\1\25"+ "\6\0\2\25\5\0\1\25\1\0\1\25\1\0\1\25"+
"\1\0\11\25\2\0\1\25\4\0\1\25\4\0\6\25"+ "\1\0\11\25\2\0\1\25\4\0\1\25\4\0\6\25"+
"\2\0\1\25\1\0\1\25\1\0\3\25\1\0\1\52"+ "\2\0\1\25\1\0\1\25\1\0\3\25\1\0\1\25"+
"\1\0\2\25\4\0\3\25\1\0\1\25\10\0\1\25"+
"\1\0\2\25\21\0\1\25\3\0\1\25\5\0\1\25"+
"\32\0\15\25\5\0\3\25\1\0\1\25\5\0\3\25"+
"\5\0\1\25\2\0\2\25\4\0\1\25\2\0\1\25"+
"\1\0\1\25\103\0\2\25\6\0\1\25\56\0\1\25"+
"\3\0\1\25\2\0\1\25\3\0\1\25\5\0\1\25"+
"\7\0\1\25\4\0\2\25\3\0\2\25\1\0\1\25"+
"\4\0\1\25\1\0\1\25\2\0\2\25\1\0\3\25"+
"\1\0\1\25\2\0\4\25\2\0\1\25\41\0\1\35"+
"\1\0\1\36\1\0\1\37\1\0\1\40\1\0\1\41"+
"\1\0\1\123\3\0\1\43\5\0\1\44\3\0\1\124"+
"\11\0\1\46\2\0\1\125\16\0\1\126\2\0\1\127"+
"\41\0\1\25\2\52\2\0\2\130\1\54\1\0\1\52"+
"\2\0\1\25\1\0\1\35\1\0\1\36\1\0\1\37"+
"\1\0\1\40\1\0\1\41\1\0\1\131\3\0\1\132"+
"\5\0\1\133\3\0\1\134\11\0\1\46\2\0\1\135"+
"\16\0\1\136\2\0\1\137\41\0\1\25\1\53\7\0"+
"\1\53\2\0\1\25\1\0\1\35\1\0\1\36\1\0"+
"\1\37\1\0\1\40\1\0\1\41\1\0\1\140\3\0"+
"\1\43\5\0\1\44\3\0\1\141\11\0\1\46\2\0"+
"\1\142\16\0\1\143\2\0\1\144\21\0\1\101\17\0"+
"\1\25\1\54\1\52\1\103\3\0\1\54\1\0\1\54"+
"\2\0\1\25\2\0\1\26\11\0\3\25\5\0\1\25"+
"\1\0\1\25\1\0\1\25\4\0\1\25\4\0\1\26"+
"\1\0\2\26\4\0\1\25\5\0\1\25\3\0\1\26"+
"\4\0\1\26\2\25\2\26\10\0\1\26\1\0\2\25"+
"\1\0\1\26\10\0\1\25\24\0\1\25\3\0\1\25"+
"\6\0\2\25\5\0\1\25\1\0\1\25\1\0\1\25"+
"\1\0\11\25\2\0\1\25\4\0\1\25\4\0\6\25"+
"\2\0\1\25\1\0\1\25\1\0\3\25\1\0\1\26"+
"\1\0\2\25\4\0\3\25\1\0\1\25\10\0\1\25"+ "\1\0\2\25\4\0\3\25\1\0\1\25\10\0\1\25"+
"\1\0\2\25\21\0\1\25\3\0\1\25\5\0\1\25"+ "\1\0\2\25\21\0\1\25\3\0\1\25\5\0\1\25"+
"\32\0\15\25\5\0\3\25\1\0\1\25\5\0\1\25"+ "\32\0\15\25\5\0\3\25\1\0\1\25\5\0\1\25"+
"\2\52\5\0\1\25\2\0\1\25\1\52\4\0\1\25"+ "\2\26\5\0\1\25\2\0\1\25\1\26\4\0\1\25"+
"\2\0\1\25\1\0\1\25\103\0\2\52\6\0\1\52"+ "\2\0\1\25\1\0\1\25\103\0\2\26\6\0\1\26"+
"\56\0\1\52\3\0\1\52\2\0\1\52\3\0\1\52"+ "\56\0\1\26\3\0\1\26\2\0\1\26\3\0\1\26"+
"\5\0\1\52\7\0\1\52\4\0\2\52\3\0\2\52"+ "\5\0\1\26\7\0\1\26\4\0\2\26\3\0\2\26"+
"\1\0\1\52\4\0\1\52\1\0\1\52\2\0\2\52"+ "\1\0\1\26\4\0\1\26\1\0\1\26\2\0\2\26"+
"\1\0\3\52\1\0\1\52\2\0\4\52\2\0\1\52"+ "\1\0\3\26\1\0\1\26\2\0\4\26\2\0\1\26"+
"\53\0\1\154\3\0\1\155\5\0\1\156\3\0\1\157"+ "\53\0\1\145\3\0\1\146\5\0\1\147\3\0\1\150"+
"\14\0\1\160\16\0\1\161\2\0\1\162\42\0\1\116"+ "\14\0\1\151\16\0\1\152\2\0\1\153\42\0\1\64"+
"\1\52\6\0\1\116\5\0\1\53\11\0\3\25\5\0"+ "\1\26\6\0\1\64\4\0\1\35\1\0\1\36\1\0"+
"\1\25\1\0\1\25\1\0\1\25\4\0\1\25\4\0"+ "\1\37\1\0\1\40\1\0\1\41\1\0\1\154\3\0"+
"\1\53\1\0\2\53\4\0\1\25\5\0\1\25\3\0"+ "\1\56\5\0\1\57\3\0\1\155\11\0\1\46\2\0"+
"\1\53\4\0\1\53\2\25\2\53\12\0\2\25\1\0"+ "\1\156\16\0\1\157\2\0\1\160\21\0\1\101\17\0"+
"\1\53\10\0\1\25\24\0\1\25\11\0\2\25\2\0"+ "\1\25\1\65\1\26\1\103\3\0\1\65\1\0\1\65"+
"\5\25\2\0\2\25\4\0\6\25\1\0\2\25\4\0"+ "\2\0\1\25\2\0\1\27\37\0\1\27\1\0\2\27"+
"\5\25\1\0\5\25\1\0\2\25\1\0\3\25\1\0"+ "\16\0\1\27\4\0\1\27\2\0\2\27\15\0\1\27"+
"\4\25\1\0\5\25\2\0\1\25\1\0\1\25\1\0"+ "\132\0\1\27\153\0\2\27\11\0\1\27\115\0\2\27"+
"\3\25\2\0\1\25\1\0\1\25\1\0\1\25\2\0"+ "\6\0\1\27\56\0\1\27\3\0\1\27\2\0\1\27"+
"\1\25\17\0\1\25\3\0\1\25\5\0\2\25\3\0"+ "\3\0\1\27\5\0\1\27\7\0\1\27\4\0\2\27"+
"\1\25\4\0\3\25\4\0\1\25\1\0\1\25\2\0"+ "\3\0\2\27\1\0\1\27\4\0\1\27\1\0\1\27"+
"\1\25\1\0\2\25\4\0\1\25\1\0\1\25\3\0"+ "\2\0\2\27\1\0\3\27\1\0\1\27\2\0\4\27"+
"\2\25\1\0\1\25\5\0\3\25\1\0\1\25\10\0"+ "\2\0\1\27\153\0\1\27\35\0\1\102\11\0\3\25"+
"\1\25\4\0\1\25\10\0\1\25\24\0\1\25\3\0"+ "\5\0\1\25\1\0\1\25\1\0\1\25\4\0\1\25"+
"\1\25\6\0\2\25\5\0\1\25\1\0\1\25\1\0"+ "\4\0\1\102\1\0\2\102\4\0\1\25\5\0\1\25"+
"\1\25\1\0\11\25\2\0\1\25\4\0\1\25\4\0"+ "\3\0\1\102\4\0\1\102\2\25\2\102\10\0\1\26"+
"\6\25\2\0\1\25\1\0\1\25\1\0\3\25\1\0"+ "\1\0\2\25\1\0\1\102\10\0\1\25\24\0\1\25"+
"\1\53\1\0\2\25\4\0\3\25\1\0\1\25\10\0"+ "\3\0\1\25\6\0\2\25\5\0\1\25\1\0\1\25"+
"\1\25\1\0\2\25\21\0\1\25\3\0\1\25\5\0"+ "\1\0\1\25\1\0\11\25\2\0\1\25\4\0\1\25"+
"\1\25\32\0\15\25\5\0\3\25\1\0\1\25\5\0"+ "\4\0\6\25\2\0\1\25\1\0\1\25\1\0\3\25"+
"\1\25\2\53\5\0\1\25\2\0\1\25\1\53\4\0"+ "\1\0\1\102\1\0\2\25\4\0\3\25\1\0\1\25"+
"\1\25\2\0\1\25\1\0\1\25\103\0\2\53\6\0"+ "\10\0\1\25\1\0\2\25\21\0\1\25\3\0\1\25"+
"\1\53\56\0\1\53\3\0\1\53\2\0\1\53\3\0"+ "\5\0\1\25\32\0\15\25\5\0\3\25\1\0\1\25"+
"\1\53\5\0\1\53\7\0\1\53\4\0\2\53\3\0"+ "\5\0\1\25\2\102\5\0\1\25\2\0\1\25\1\102"+
"\2\53\1\0\1\53\4\0\1\53\1\0\1\53\2\0"+ "\4\0\1\25\2\0\1\25\1\0\1\25\103\0\2\102"+
"\2\53\1\0\3\53\1\0\1\53\2\0\4\53\2\0"+ "\6\0\1\102\56\0\1\102\3\0\1\102\2\0\1\102"+
"\1\53\42\0\1\54\11\0\3\25\5\0\1\25\1\0"+ "\3\0\1\102\5\0\1\102\7\0\1\102\4\0\2\102"+
"\1\25\1\0\1\25\4\0\1\25\4\0\1\54\1\0"+ "\3\0\2\102\1\0\1\102\4\0\1\102\1\0\1\102"+
"\2\54\4\0\1\25\5\0\1\25\3\0\1\54\4\0"+ "\2\0\2\102\1\0\3\102\1\0\1\102\2\0\4\102"+
"\1\54\2\25\2\54\10\0\1\52\1\0\2\25\1\0"+ "\2\0\1\102\153\0\1\103\46\0\1\161\15\0\1\162"+
"\1\54\10\0\1\25\24\0\1\25\3\0\1\25\6\0"+ "\14\0\1\163\16\0\1\164\2\0\1\165\21\0\1\101"+
"\2\25\5\0\1\25\1\0\1\25\1\0\1\25\1\0"+ "\20\0\1\103\1\0\1\103\3\0\1\54\1\0\1\103"+
"\11\25\2\0\1\25\4\0\1\25\4\0\6\25\2\0"+ "\5\0\1\32\37\0\1\32\1\0\2\32\16\0\1\32"+
"\1\25\1\0\1\25\1\0\3\25\1\0\1\54\1\0"+ "\4\0\1\32\2\0\2\32\15\0\1\32\132\0\1\32"+
"\2\25\4\0\3\25\1\0\1\25\10\0\1\25\1\0"+ "\153\0\2\32\11\0\1\32\115\0\2\32\6\0\1\32"+
"\2\25\21\0\1\25\3\0\1\25\5\0\1\25\32\0"+ "\56\0\1\32\3\0\1\32\2\0\1\32\3\0\1\32"+
"\15\25\5\0\3\25\1\0\1\25\5\0\1\25\2\54"+ "\5\0\1\32\7\0\1\32\4\0\2\32\3\0\2\32"+
"\5\0\1\25\2\0\1\25\1\54\4\0\1\25\2\0"+ "\1\0\1\32\4\0\1\32\1\0\1\32\2\0\2\32"+
"\1\25\1\0\1\25\103\0\2\54\6\0\1\54\56\0"+ "\1\0\3\32\1\0\1\32\2\0\4\32\2\0\1\32"+
"\1\54\3\0\1\54\2\0\1\54\3\0\1\54\5\0"+ "\42\0\1\33\37\0\1\33\1\0\2\33\16\0\1\33"+
"\1\54\7\0\1\54\4\0\2\54\3\0\2\54\1\0"+ "\4\0\1\33\2\0\2\33\15\0\1\33\132\0\1\33"+
"\1\54\4\0\1\54\1\0\1\54\2\0\2\54\1\0"+ "\153\0\2\33\11\0\1\33\115\0\2\33\6\0\1\33"+
"\3\54\1\0\1\54\2\0\4\54\2\0\1\54\42\0"+ "\56\0\1\33\3\0\1\33\2\0\1\33\3\0\1\33"+
"\1\64\37\0\1\64\1\0\2\64\16\0\1\64\4\0"+ "\5\0\1\33\7\0\1\33\4\0\2\33\3\0\2\33"+
"\1\64\2\0\2\64\10\0\1\26\4\0\1\64\37\0"+ "\1\0\1\33\4\0\1\33\1\0\1\33\2\0\2\33"+
"\1\26\102\0\1\26\147\0\2\26\134\0\1\64\153\0"+ "\1\0\3\33\1\0\1\33\2\0\4\33\2\0\1\33"+
"\2\64\11\0\1\64\115\0\2\64\6\0\1\64\56\0"+ "\42\0\1\34\11\0\3\25\5\0\1\25\1\0\1\25"+
"\1\64\3\0\1\64\2\0\1\64\3\0\1\64\5\0"+ "\1\0\1\25\4\0\1\25\4\0\1\34\1\0\2\34"+
"\1\64\7\0\1\64\4\0\2\64\3\0\2\64\1\0"+ "\4\0\1\25\5\0\1\25\3\0\1\34\4\0\1\34"+
"\1\64\4\0\1\64\1\0\1\64\2\0\2\64\1\0"+ "\2\25\2\34\10\0\1\52\1\0\2\25\1\0\1\34"+
"\3\64\1\0\1\64\2\0\4\64\2\0\1\64\42\0"+ "\10\0\1\25\24\0\1\25\3\0\1\25\6\0\2\25"+
"\1\65\11\0\3\25\5\0\1\25\1\0\1\25\1\0"+ "\5\0\1\25\1\0\1\25\1\0\1\25\1\0\11\25"+
"\1\25\4\0\1\25\4\0\1\65\1\0\2\65\4\0"+ "\2\0\1\25\4\0\1\25\4\0\6\25\2\0\1\25"+
"\1\25\5\0\1\25\3\0\1\65\4\0\1\65\2\25"+ "\1\0\1\25\1\0\3\25\1\0\1\34\1\0\2\25"+
"\2\65\10\0\1\26\1\0\2\25\1\0\1\65\10\0"+ "\4\0\3\25\1\0\1\25\10\0\1\25\1\0\2\25"+
"\21\0\1\25\3\0\1\25\5\0\1\25\32\0\15\25"+
"\5\0\3\25\1\0\1\25\5\0\1\25\2\34\5\0"+
"\1\25\2\0\1\25\1\34\4\0\1\25\2\0\1\25"+
"\1\0\1\25\103\0\2\34\6\0\1\34\56\0\1\34"+
"\3\0\1\34\2\0\1\34\3\0\1\34\5\0\1\34"+
"\7\0\1\34\4\0\2\34\3\0\2\34\1\0\1\34"+
"\4\0\1\34\1\0\1\34\2\0\2\34\1\0\3\34"+
"\1\0\1\34\2\0\4\34\2\0\1\34\42\0\1\52"+
"\11\0\3\25\5\0\1\25\1\0\1\25\1\0\1\25"+
"\4\0\1\25\4\0\1\52\1\0\2\52\4\0\1\25"+
"\5\0\1\25\3\0\1\52\4\0\1\52\2\25\2\52"+
"\10\0\1\52\1\0\2\25\1\0\1\52\10\0\1\25"+
"\24\0\1\25\3\0\1\25\6\0\2\25\5\0\1\25"+
"\1\0\1\25\1\0\1\25\1\0\11\25\2\0\1\25"+
"\4\0\1\25\4\0\6\25\2\0\1\25\1\0\1\25"+
"\1\0\3\25\1\0\1\52\1\0\2\25\4\0\3\25"+
"\1\0\1\25\10\0\1\25\1\0\2\25\21\0\1\25"+
"\3\0\1\25\5\0\1\25\32\0\15\25\5\0\3\25"+
"\1\0\1\25\5\0\1\25\2\52\5\0\1\25\2\0"+
"\1\25\1\52\4\0\1\25\2\0\1\25\1\0\1\25"+
"\103\0\2\52\6\0\1\52\56\0\1\52\3\0\1\52"+
"\2\0\1\52\3\0\1\52\5\0\1\52\7\0\1\52"+
"\4\0\2\52\3\0\2\52\1\0\1\52\4\0\1\52"+
"\1\0\1\52\2\0\2\52\1\0\3\52\1\0\1\52"+
"\2\0\4\52\2\0\1\52\53\0\1\166\3\0\1\167"+
"\5\0\1\170\3\0\1\171\14\0\1\172\16\0\1\173"+
"\2\0\1\174\42\0\1\130\1\52\6\0\1\130\5\0"+
"\1\53\11\0\3\25\5\0\1\25\1\0\1\25\1\0"+
"\1\25\4\0\1\25\4\0\1\53\1\0\2\53\4\0"+
"\1\25\5\0\1\25\3\0\1\53\4\0\1\53\2\25"+
"\2\53\12\0\2\25\1\0\1\53\10\0\1\25\24\0"+
"\1\25\11\0\2\25\2\0\5\25\2\0\2\25\4\0"+
"\6\25\1\0\2\25\4\0\5\25\1\0\5\25\1\0"+
"\2\25\1\0\3\25\1\0\4\25\1\0\5\25\2\0"+
"\1\25\1\0\1\25\1\0\3\25\2\0\1\25\1\0"+
"\1\25\1\0\1\25\2\0\1\25\17\0\1\25\3\0"+
"\1\25\5\0\2\25\3\0\1\25\4\0\3\25\4\0"+
"\1\25\1\0\1\25\2\0\1\25\1\0\2\25\4\0"+
"\1\25\1\0\1\25\3\0\2\25\1\0\1\25\5\0"+
"\3\25\1\0\1\25\10\0\1\25\4\0\1\25\10\0"+
"\1\25\24\0\1\25\3\0\1\25\6\0\2\25\5\0"+ "\1\25\24\0\1\25\3\0\1\25\6\0\2\25\5\0"+
"\1\25\1\0\1\25\1\0\1\25\1\0\11\25\2\0"+ "\1\25\1\0\1\25\1\0\1\25\1\0\11\25\2\0"+
"\1\25\4\0\1\25\4\0\6\25\2\0\1\25\1\0"+ "\1\25\4\0\1\25\4\0\6\25\2\0\1\25\1\0"+
"\1\25\1\0\3\25\1\0\1\65\1\0\2\25\4\0"+ "\1\25\1\0\3\25\1\0\1\53\1\0\2\25\4\0"+
"\3\25\1\0\1\25\10\0\1\25\1\0\2\25\21\0"+ "\3\25\1\0\1\25\10\0\1\25\1\0\2\25\21\0"+
"\1\25\3\0\1\25\5\0\1\25\32\0\15\25\5\0"+ "\1\25\3\0\1\25\5\0\1\25\32\0\15\25\5\0"+
"\3\25\1\0\1\25\5\0\1\25\2\65\5\0\1\25"+ "\3\25\1\0\1\25\5\0\1\25\2\53\5\0\1\25"+
"\2\0\1\25\1\65\4\0\1\25\2\0\1\25\1\0"+ "\2\0\1\25\1\53\4\0\1\25\2\0\1\25\1\0"+
"\1\25\103\0\2\65\6\0\1\65\56\0\1\65\3\0"+ "\1\25\103\0\2\53\6\0\1\53\56\0\1\53\3\0"+
"\1\65\2\0\1\65\3\0\1\65\5\0\1\65\7\0"+ "\1\53\2\0\1\53\3\0\1\53\5\0\1\53\7\0"+
"\1\65\4\0\2\65\3\0\2\65\1\0\1\65\4\0"+ "\1\53\4\0\2\53\3\0\2\53\1\0\1\53\4\0"+
"\1\65\1\0\1\65\2\0\2\65\1\0\3\65\1\0"+ "\1\53\1\0\1\53\2\0\2\53\1\0\3\53\1\0"+
"\1\65\2\0\4\65\2\0\1\65\42\0\1\103\37\0"+ "\1\53\2\0\4\53\2\0\1\53\42\0\1\54\11\0"+
"\1\103\1\0\2\103\16\0\1\103\4\0\1\103\2\0"+ "\3\25\5\0\1\25\1\0\1\25\1\0\1\25\4\0"+
"\2\103\15\0\1\103\132\0\1\103\153\0\2\103\11\0"+ "\1\25\4\0\1\54\1\0\2\54\4\0\1\25\5\0"+
"\1\103\115\0\2\103\6\0\1\103\56\0\1\103\3\0"+ "\1\25\3\0\1\54\4\0\1\54\2\25\2\54\10\0"+
"\1\103\2\0\1\103\3\0\1\103\5\0\1\103\7\0"+ "\1\52\1\0\2\25\1\0\1\54\10\0\1\25\24\0"+
"\1\103\4\0\2\103\3\0\2\103\1\0\1\103\4\0"+ "\1\25\3\0\1\25\6\0\2\25\5\0\1\25\1\0"+
"\1\103\1\0\1\103\2\0\2\103\1\0\3\103\1\0"+ "\1\25\1\0\1\25\1\0\11\25\2\0\1\25\4\0"+
"\1\103\2\0\4\103\2\0\1\103\42\0\1\116\37\0"+ "\1\25\4\0\6\25\2\0\1\25\1\0\1\25\1\0"+
"\1\116\1\0\2\116\16\0\1\116\4\0\1\116\2\0"+ "\3\25\1\0\1\54\1\0\2\25\4\0\3\25\1\0"+
"\2\116\10\0\1\52\4\0\1\116\37\0\1\52\102\0"+ "\1\25\10\0\1\25\1\0\2\25\21\0\1\25\3\0"+
"\1\52\147\0\2\52\134\0\1\116\153\0\2\116\11\0"+ "\1\25\5\0\1\25\32\0\15\25\5\0\3\25\1\0"+
"\1\116\115\0\2\116\6\0\1\116\56\0\1\116\3\0"+ "\1\25\5\0\1\25\2\54\5\0\1\25\2\0\1\25"+
"\1\116\2\0\1\116\3\0\1\116\5\0\1\116\7\0"+ "\1\54\4\0\1\25\2\0\1\25\1\0\1\25\103\0"+
"\1\116\4\0\2\116\3\0\2\116\1\0\1\116\4\0"+ "\2\54\6\0\1\54\56\0\1\54\3\0\1\54\2\0"+
"\1\116\1\0\1\116\2\0\2\116\1\0\3\116\1\0"+ "\1\54\3\0\1\54\5\0\1\54\7\0\1\54\4\0"+
"\1\116\2\0\4\116\2\0\1\116\40\0"; "\2\54\3\0\2\54\1\0\1\54\4\0\1\54\1\0"+
"\1\54\2\0\2\54\1\0\3\54\1\0\1\54\2\0"+
"\4\54\2\0\1\54\42\0\1\64\37\0\1\64\1\0"+
"\2\64\16\0\1\64\4\0\1\64\2\0\2\64\10\0"+
"\1\26\4\0\1\64\37\0\1\26\102\0\1\26\147\0"+
"\2\26\134\0\1\64\153\0\2\64\11\0\1\64\115\0"+
"\2\64\6\0\1\64\56\0\1\64\3\0\1\64\2\0"+
"\1\64\3\0\1\64\5\0\1\64\7\0\1\64\4\0"+
"\2\64\3\0\2\64\1\0\1\64\4\0\1\64\1\0"+
"\1\64\2\0\2\64\1\0\3\64\1\0\1\64\2\0"+
"\4\64\2\0\1\64\42\0\1\65\11\0\3\25\5\0"+
"\1\25\1\0\1\25\1\0\1\25\4\0\1\25\4\0"+
"\1\65\1\0\2\65\4\0\1\25\5\0\1\25\3\0"+
"\1\65\4\0\1\65\2\25\2\65\10\0\1\26\1\0"+
"\2\25\1\0\1\65\10\0\1\25\24\0\1\25\3\0"+
"\1\25\6\0\2\25\5\0\1\25\1\0\1\25\1\0"+
"\1\25\1\0\11\25\2\0\1\25\4\0\1\25\4\0"+
"\6\25\2\0\1\25\1\0\1\25\1\0\3\25\1\0"+
"\1\65\1\0\2\25\4\0\3\25\1\0\1\25\10\0"+
"\1\25\1\0\2\25\21\0\1\25\3\0\1\25\5\0"+
"\1\25\32\0\15\25\5\0\3\25\1\0\1\25\5\0"+
"\1\25\2\65\5\0\1\25\2\0\1\25\1\65\4\0"+
"\1\25\2\0\1\25\1\0\1\25\103\0\2\65\6\0"+
"\1\65\56\0\1\65\3\0\1\65\2\0\1\65\3\0"+
"\1\65\5\0\1\65\7\0\1\65\4\0\2\65\3\0"+
"\2\65\1\0\1\65\4\0\1\65\1\0\1\65\2\0"+
"\2\65\1\0\3\65\1\0\1\65\2\0\4\65\2\0"+
"\1\65\42\0\1\103\37\0\1\103\1\0\2\103\16\0"+
"\1\103\4\0\1\103\2\0\2\103\15\0\1\103\132\0"+
"\1\103\153\0\2\103\11\0\1\103\115\0\2\103\6\0"+
"\1\103\56\0\1\103\3\0\1\103\2\0\1\103\3\0"+
"\1\103\5\0\1\103\7\0\1\103\4\0\2\103\3\0"+
"\2\103\1\0\1\103\4\0\1\103\1\0\1\103\2\0"+
"\2\103\1\0\3\103\1\0\1\103\2\0\4\103\2\0"+
"\1\103\42\0\1\130\37\0\1\130\1\0\2\130\16\0"+
"\1\130\4\0\1\130\2\0\2\130\10\0\1\52\4\0"+
"\1\130\37\0\1\52\102\0\1\52\147\0\2\52\134\0"+
"\1\130\153\0\2\130\11\0\1\130\115\0\2\130\6\0"+
"\1\130\56\0\1\130\3\0\1\130\2\0\1\130\3\0"+
"\1\130\5\0\1\130\7\0\1\130\4\0\2\130\3\0"+
"\2\130\1\0\1\130\4\0\1\130\1\0\1\130\2\0"+
"\2\130\1\0\3\130\1\0\1\130\2\0\4\130\2\0"+
"\1\130\40\0";
private static int [] zzUnpackTrans() { private static int [] zzUnpackTrans() {
int [] result = new int[10609]; int [] result = new int[11845];
int offset = 0; int offset = 0;
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
return result; return result;
@ -621,11 +640,11 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface {
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
private static final String ZZ_ATTRIBUTE_PACKED_0 = private static final String ZZ_ATTRIBUTE_PACKED_0 =
"\1\0\1\11\27\1\2\11\1\1\15\0\1\1\1\0"+ "\1\0\1\11\32\1\15\0\1\1\1\0\1\1\10\0"+
"\1\1\10\0\1\1\15\0\1\1\57\0"; "\1\1\15\0\1\1\71\0";
private static int [] zzUnpackAttribute() { private static int [] zzUnpackAttribute() {
int [] result = new int[114]; int [] result = new int[124];
int offset = 0; int offset = 0;
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
return result; return result;

View File

@ -71,6 +71,8 @@ MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})* MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})* ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
HanEx = {Han} ({Format} | {Extend})*
HiraganaEx = {Hiragana} ({Format} | {Extend})*
%{ %{
/** Alphanumeric sequences */ /** Alphanumeric sequences */
@ -178,8 +180,8 @@ ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
// UAX#29 WB14. Any ÷ Any // UAX#29 WB14. Any ÷ Any
// //
{Han} { return IDEOGRAPHIC_TYPE; } {HanEx} { return IDEOGRAPHIC_TYPE; }
{Hiragana} { return HIRAGANA_TYPE; } {HiraganaEx} { return HIRAGANA_TYPE; }
// UAX#29 WB3. CR × LF // UAX#29 WB3. CR × LF

View File

@ -22,7 +22,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.Reader; import java.io.Reader;
import java.io.IOException; import java.io.IOException;
interface StandardTokenizerInterface { /** @lucene.internal */
public interface StandardTokenizerInterface {
/** This character denotes the end of file */ /** This character denotes the end of file */
public static final int YYEOF = -1; public static final int YYEOF = -1;
@ -30,12 +31,12 @@ interface StandardTokenizerInterface {
/** /**
* Copies the matched text into the CharTermAttribute * Copies the matched text into the CharTermAttribute
*/ */
void getText(CharTermAttribute t); public void getText(CharTermAttribute t);
/** /**
* Returns the current position. * Returns the current position.
*/ */
int yychar(); public int yychar();
/** /**
* Resets the scanner to read from a new input stream. * Resets the scanner to read from a new input stream.
@ -47,12 +48,12 @@ interface StandardTokenizerInterface {
* *
* @param reader the new input stream * @param reader the new input stream
*/ */
void yyreset(Reader reader); public void yyreset(Reader reader);
/** /**
* Returns the length of the matched text region. * Returns the length of the matched text region.
*/ */
int yylength(); public int yylength();
/** /**
* Resumes scanning until the next regular expression is matched, * Resumes scanning until the next regular expression is matched,
@ -61,6 +62,6 @@ interface StandardTokenizerInterface {
* @return the next token, {@link #YYEOF} on end of stream * @return the next token, {@link #YYEOF} on end of stream
* @exception IOException if any I/O-Error occurs * @exception IOException if any I/O-Error occurs
*/ */
int getNextToken() throws IOException; public int getNextToken() throws IOException;
} }

View File

@ -0,0 +1,184 @@
package org.apache.lucene.analysis.standard.std31;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizerInterface;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* This class implements StandardTokenizer as it behaved before the fix for
* LUCENE-3358 (https://issues.apache.org/jira/browse/LUCENE-3358): combining
* marks that follow Han or Hiragana characters are split off and discarded.
* @deprecated This class is only for exact backwards compatibility
*/
@Deprecated
%%
%unicode 6.0
%integer
%final
%public
%class StandardTokenizerImpl31
%implements StandardTokenizerInterface
%function getNextToken
%char
%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
// Character-class macros. Each one unions a Unicode property class with a
// matching *Supp macro. The *Supp macros are not defined in this file;
// presumably they come from the included SUPPLEMENTARY.jflex-macro -- confirm.
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
Format = ([\p{WB:Format}] | {FormatSupp})
Numeric = ([\p{WB:Numeric}] | {NumericSupp})
Extend = ([\p{WB:Extend}] | {ExtendSupp})
Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
// Han and Hiragana have no *Ex variant in this grammar, so the rules that use
// them match a single character with no trailing Format/Extend characters.
Han = ([\p{Script:Han}] | {HanSupp})
Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
// Script=Hangul & ALetter
// The intersection is spelled with negation, !(!A | !B), which is A & B by
// De Morgan's law. The result is then extended per WB4 like the *Ex macros below.
HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
// UAX#29 WB4. X (Extend | Format)* --> X
//
// Every *Ex macro is its base class followed by any number of Format/Extend
// characters, so those characters stay attached to the character before them.
ALetterEx = {ALetter} ({Format} | {Extend})*
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
NumericEx = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
KatakanaEx = {Katakana} ({Format} | {Extend})*
// MidNumLet is accepted in both the letter and the numeric "mid" positions.
MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
%{
/** Alphanumeric sequences; returned by the general word rule. */
public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;
/** Numbers; returned by the numeric rule. */
public static final int NUMERIC_TYPE = StandardTokenizer.NUM;
/**
 * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
 * scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
 * together as a single token rather than broken up, because the logic
 * required to break them at word boundaries is too complex for UAX#29.
 * <p>
 * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
 */
public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;
/** Returned by the rule that matches a single Han character. */
public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
/** Returned by the rule that matches a single Hiragana character. */
public static final int HIRAGANA_TYPE = StandardTokenizer.HIRAGANA;
/** Returned by the rule that matches a run of Katakana characters. */
public static final int KATAKANA_TYPE = StandardTokenizer.KATAKANA;
/** Returned by the rule that matches a run of Hangul characters. */
public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
/**
 * Exposes the scanner's {@code yychar} counter.
 * <p>
 * Character counting is switched on by the {@code %char} option of this
 * grammar; per the JFlex manual the counter holds the number of characters
 * from the start of input to the start of the current token.
 *
 * @return the current value of {@code yychar}
 */
public final int yychar() {
  return this.yychar;
}
/**
 * Copies the text of the current token into the given attribute.
 *
 * @param t attribute that receives the characters of the current token
 */
public final void getText(CharTermAttribute t) {
  // The current match spans [zzStartRead, zzMarkedPos) in the scanner buffer.
  final int tokenLength = zzMarkedPos - zzStartRead;
  t.copyBuffer(zzBuffer, zzStartRead, tokenLength);
}
%}
%%
// UAX#29 WB1. sot ÷
// WB2. ÷ eot
//
<<EOF>> { return StandardTokenizerInterface.YYEOF; }
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet) × Numeric
// WB12. Numeric × (MidNum | MidNumLet) Numeric
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
// Matches one or more Numeric characters, optionally joined by a single
// MidNum/MidNumLet or by ExtendNumLet, with optional leading and trailing
// ExtendNumLet. It is listed ahead of the general word rule: when two rules
// match the same longest input, JFlex picks the one that appears first, so
// input matched equally by both is typed NUMERIC_TYPE rather than WORD_TYPE.
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
| {MidNumericEx} {NumericEx}
| {NumericEx})*
{ExtendNumLetEx}*
{ return NUMERIC_TYPE; }
// subset of the below for typing purposes only!
// Runs made up purely of Hangul or purely of Katakana would also be matched
// by the general word rule that follows. These two rules come first so that
// such runs are reported with their own token type instead of WORD_TYPE.
{HangulEx}+
{ return HANGUL_TYPE; }
{KatakanaEx}+
{ return KATAKANA_TYPE; }
// UAX#29 WB5. ALetter × ALetter
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
// WB7. ALetter (MidLetter | MidNumLet) × ALetter
// WB9. ALetter × Numeric
// WB10. Numeric × ALetter
// WB13. Katakana × Katakana
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
// Shape of the expression: optional leading ExtendNumLet, one segment, any
// number of (ExtendNumLet+ segment) repetitions, optional trailing
// ExtendNumLet. A segment is either a Katakana run (Katakana characters
// optionally joined by ExtendNumLet) or one or more adjacent Numeric/ALetter
// runs. Inside a Numeric run, characters may be joined by ExtendNumLet or by
// one MidNum/MidNumLet; inside an ALetter run, by ExtendNumLet or by one
// MidLetter/MidNumLet. The segment sub-expression is written out twice.
{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ )
({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
{ExtendNumLetEx}*
{ return WORD_TYPE; }
// From UAX #29:
//
// [C]haracters with the Line_Break property values of Contingent_Break (CB),
// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
// boundary property values based on criteria outside of the scope of this
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
// In Unicode 6.0, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC ( ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
// Lao, etc.) are kept together. This grammar does the same below.
//
// See also the Unicode Line Breaking Algorithm:
//
// http://www.unicode.org/reports/tr14/#SA
//
{ComplexContext}+ { return SOUTH_EAST_ASIAN_TYPE; }
// UAX#29 WB14. Any ÷ Any
//
// Each of these two rules consumes exactly one character. Unlike the *Ex
// macros used by the rules above, {Han} and {Hiragana} carry no trailing
// ({Format} | {Extend})*, so a combining mark that follows is not part of the
// token. It is matched on its own afterwards, normally by the catch-all rule
// below, which discards it. This is the LUCENE-3358 behavior that this
// deprecated grammar exists to reproduce; do not "fix" it here.
{Han} { return IDEOGRAPHIC_TYPE; }
{Hiragana} { return HIRAGANA_TYPE; }
// UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB14. Any ÷ Any
//
// Catch-all: any single character not claimed by a rule above is consumed and
// produces no token (the action is empty, so scanning simply continues).
[^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }

View File

@ -6,6 +6,7 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase; import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.apache.lucene.util.Version;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
@ -221,6 +222,23 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" }); new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
} }
/**
 * Checks that a base character followed by a combining mark comes out of the
 * analyzer as a single term that still contains the mark.
 * <p>
 * Every input is a base character followed by U+3099 (COMBINING
 * KATAKANA-HIRAGANA VOICED SOUND MARK). The sequences are written as escapes
 * so they stay decomposed however an editor or tool normalizes this file; a
 * precomposed literal would be a single character and would not exercise
 * combining-mark handling at all.
 */
public void testCombiningMarks() throws Exception {
  checkOneTerm(a, "\u3055\u3099", "\u3055\u3099"); // hiragana: さ + mark
  checkOneTerm(a, "\u30B5\u3099", "\u30B5\u3099"); // katakana: サ + mark
  checkOneTerm(a, "\u58F9\u3099", "\u58F9\u3099"); // ideographic: 壹 + mark
  checkOneTerm(a, "\uC544\u3099", "\uC544\u3099"); // hangul: 아 + mark
}
/**
 * Pins the legacy tokenization selected by {@link Version#LUCENE_33}: a
 * combining mark after a Hiragana or Han character is dropped, so the single
 * term is the bare base character, while Katakana and Hangul keep the mark.
 * <p>
 * Every input is a base character followed by U+3099 (COMBINING
 * KATAKANA-HIRAGANA VOICED SOUND MARK), written as escapes so the sequences
 * stay decomposed regardless of source-file normalization.
 *
 * @deprecated remove this and sophisticated backwards layer in 5.0
 */
@Deprecated
public void testCombiningMarksBackwards() throws Exception {
  Analyzer a = new StandardAnalyzer(Version.LUCENE_33);
  checkOneTerm(a, "\u3055\u3099", "\u3055"); // hiragana Bug: さ kept, mark dropped
  checkOneTerm(a, "\u30B5\u3099", "\u30B5\u3099"); // katakana Works
  checkOneTerm(a, "\u58F9\u3099", "\u58F9"); // ideographic Bug: 壹 kept, mark dropped
  checkOneTerm(a, "\uC544\u3099", "\uC544\u3099"); // hangul Works
}
/** blast some random strings through the analyzer */ /** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception { public void testRandomStrings() throws Exception {
checkRandomData(random, new StandardAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER); checkRandomData(random, new StandardAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);