mirror of https://github.com/apache/lucene.git
LUCENE-3358: StandardTokenizer wrongly discarded combining marks attached to Han/Hiragana
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1154005 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ca05564005
commit
2dda5bc35f
|
@ -535,6 +535,11 @@ Bug fixes
|
||||||
suppressed exceptions in the original exception, so stack trace
|
suppressed exceptions in the original exception, so stack trace
|
||||||
will contain them. (Uwe Schindler)
|
will contain them. (Uwe Schindler)
|
||||||
|
|
||||||
|
* LUCENE-3358: StandardTokenizer wrongly discarded combining marks attached
|
||||||
|
to Han or Hiragana characters. This is fixed if you supply Version >= 3.4.
|
||||||
|
If you supply a previous Lucene version, you get the old buggy behavior
|
||||||
|
for backwards compatibility. (Trejkaz, Robert Muir)
|
||||||
|
|
||||||
New Features
|
New Features
|
||||||
|
|
||||||
* LUCENE-3290: Added FieldInvertState.numUniqueTerms
|
* LUCENE-3290: Added FieldInvertState.numUniqueTerms
|
||||||
|
|
|
@ -67,6 +67,9 @@
|
||||||
<jflex file="src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex"
|
<jflex file="src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex"
|
||||||
outdir="src/java/org/apache/lucene/analysis/standard"
|
outdir="src/java/org/apache/lucene/analysis/standard"
|
||||||
nobak="on" />
|
nobak="on" />
|
||||||
|
<jflex file="src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex"
|
||||||
|
outdir="src/java/org/apache/lucene/analysis/standard/std31"
|
||||||
|
nobak="on" />
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="jflex-UAX29URLEmailTokenizer" depends="jflex-check" if="jflex.present">
|
<target name="jflex-UAX29URLEmailTokenizer" depends="jflex-check" if="jflex.present">
|
||||||
|
|
|
@ -39,6 +39,9 @@ import java.util.Set;
|
||||||
* <p>You must specify the required {@link Version}
|
* <p>You must specify the required {@link Version}
|
||||||
* compatibility when creating StandardAnalyzer:
|
* compatibility when creating StandardAnalyzer:
|
||||||
* <ul>
|
* <ul>
|
||||||
|
* <li> As of 3.4, Hiragana and Han characters are no longer wrongly split
|
||||||
|
* from their combining characters. If you use a previous version number,
|
||||||
|
* you get the exact broken behavior for backwards compatibility.
|
||||||
* <li> As of 3.1, StandardTokenizer implements Unicode text segmentation,
|
* <li> As of 3.1, StandardTokenizer implements Unicode text segmentation,
|
||||||
* and StopFilter correctly handles Unicode 4.0 supplementary characters
|
* and StopFilter correctly handles Unicode 4.0 supplementary characters
|
||||||
* in stopwords. {@link ClassicTokenizer} and {@link ClassicAnalyzer}
|
* in stopwords. {@link ClassicTokenizer} and {@link ClassicAnalyzer}
|
||||||
|
|
|
@ -21,6 +21,7 @@ import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.standard.std31.StandardTokenizerImpl31;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
@ -42,6 +43,9 @@ import org.apache.lucene.util.Version;
|
||||||
* <p>You must specify the required {@link Version}
|
* <p>You must specify the required {@link Version}
|
||||||
* compatibility when creating StandardTokenizer:
|
* compatibility when creating StandardTokenizer:
|
||||||
* <ul>
|
* <ul>
|
||||||
|
* <li> As of 3.4, Hiragana and Han characters are no longer wrongly split
|
||||||
|
* from their combining characters. If you use a previous version number,
|
||||||
|
* you get the exact broken behavior for backwards compatibility.
|
||||||
* <li> As of 3.1, StandardTokenizer implements Unicode text segmentation.
|
* <li> As of 3.1, StandardTokenizer implements Unicode text segmentation.
|
||||||
* If you use a previous version number, you get the exact behavior of
|
* If you use a previous version number, you get the exact behavior of
|
||||||
* {@link ClassicTokenizer} for backwards compatibility.
|
* {@link ClassicTokenizer} for backwards compatibility.
|
||||||
|
@ -142,8 +146,13 @@ public final class StandardTokenizer extends Tokenizer {
|
||||||
}
|
}
|
||||||
|
|
||||||
private final void init(Reader input, Version matchVersion) {
|
private final void init(Reader input, Version matchVersion) {
|
||||||
this.scanner = matchVersion.onOrAfter(Version.LUCENE_31) ?
|
if (matchVersion.onOrAfter(Version.LUCENE_34)) {
|
||||||
new StandardTokenizerImpl(input) : new ClassicTokenizerImpl(input);
|
this.scanner = new StandardTokenizerImpl(input);
|
||||||
|
} else if (matchVersion.onOrAfter(Version.LUCENE_31)) {
|
||||||
|
this.scanner = new StandardTokenizerImpl31(input);
|
||||||
|
} else {
|
||||||
|
this.scanner = new ClassicTokenizerImpl(input);
|
||||||
|
}
|
||||||
this.input = input;
|
this.input = input;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 2/9/11 11:45 AM */
|
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/4/11 4:07 PM */
|
||||||
|
|
||||||
package org.apache.lucene.analysis.standard;
|
package org.apache.lucene.analysis.standard;
|
||||||
|
|
||||||
|
@ -209,10 +209,10 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface {
|
||||||
private static final String ZZ_ACTION_PACKED_0 =
|
private static final String ZZ_ACTION_PACKED_0 =
|
||||||
"\1\0\23\1\1\2\1\3\1\4\1\1\1\5\1\6"+
|
"\1\0\23\1\1\2\1\3\1\4\1\1\1\5\1\6"+
|
||||||
"\1\7\1\10\15\0\1\2\1\0\1\2\10\0\1\3"+
|
"\1\7\1\10\15\0\1\2\1\0\1\2\10\0\1\3"+
|
||||||
"\15\0\1\2\57\0";
|
"\15\0\1\2\71\0";
|
||||||
|
|
||||||
private static int [] zzUnpackAction() {
|
private static int [] zzUnpackAction() {
|
||||||
int [] result = new int[114];
|
int [] result = new int[124];
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
|
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
|
||||||
return result;
|
return result;
|
||||||
|
@ -240,21 +240,22 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface {
|
||||||
"\0\0\0\147\0\316\0\u0135\0\u019c\0\u0203\0\u026a\0\u02d1"+
|
"\0\0\0\147\0\316\0\u0135\0\u019c\0\u0203\0\u026a\0\u02d1"+
|
||||||
"\0\u0338\0\u039f\0\u0406\0\u046d\0\u04d4\0\u053b\0\u05a2\0\u0609"+
|
"\0\u0338\0\u039f\0\u0406\0\u046d\0\u04d4\0\u053b\0\u05a2\0\u0609"+
|
||||||
"\0\u0670\0\u06d7\0\u073e\0\u07a5\0\u080c\0\u0873\0\u08da\0\u0941"+
|
"\0\u0670\0\u06d7\0\u073e\0\u07a5\0\u080c\0\u0873\0\u08da\0\u0941"+
|
||||||
"\0\u09a8\0\147\0\147\0\u0a0f\0\316\0\u0135\0\u019c\0\u0203"+
|
"\0\u09a8\0\u0a0f\0\u0a76\0\u0add\0\316\0\u0135\0\u019c\0\u0203"+
|
||||||
"\0\u026a\0\u0a76\0\u0add\0\u0b44\0\u0bab\0\u046d\0\u0c12\0\u0c79"+
|
"\0\u026a\0\u0b44\0\u0bab\0\u0c12\0\u0c79\0\u046d\0\u0ce0\0\u0d47"+
|
||||||
"\0\u0ce0\0\u0d47\0\u0dae\0\u0e15\0\u0e7c\0\u0338\0\u039f\0\u0ee3"+
|
"\0\u0dae\0\u0e15\0\u0e7c\0\u0ee3\0\u0f4a\0\u0338\0\u039f\0\u0fb1"+
|
||||||
"\0\u0f4a\0\u0fb1\0\u1018\0\u107f\0\u10e6\0\u114d\0\u11b4\0\u121b"+
|
"\0\u1018\0\u107f\0\u10e6\0\u114d\0\u11b4\0\u121b\0\u1282\0\u12e9"+
|
||||||
"\0\u1282\0\u12e9\0\u1350\0\u13b7\0\u141e\0\u1485\0\u14ec\0\u1553"+
|
"\0\u1350\0\u13b7\0\u141e\0\u1485\0\u14ec\0\u1553\0\u15ba\0\u1621"+
|
||||||
"\0\u15ba\0\u0941\0\u1621\0\u1688\0\u16ef\0\u1756\0\u17bd\0\u1824"+
|
"\0\u1688\0\u0941\0\u16ef\0\u1756\0\u17bd\0\u1824\0\u188b\0\u18f2"+
|
||||||
"\0\u188b\0\u18f2\0\u1959\0\u19c0\0\u1a27\0\u1a8e\0\u1af5\0\u1b5c"+
|
"\0\u1959\0\u19c0\0\u1a27\0\u1a8e\0\u1af5\0\u1b5c\0\u1bc3\0\u1c2a"+
|
||||||
"\0\u1bc3\0\u1c2a\0\u1c91\0\u1cf8\0\u1d5f\0\u1dc6\0\u1e2d\0\u1e94"+
|
"\0\u1c91\0\u1cf8\0\u1d5f\0\u1dc6\0\u1e2d\0\u1e94\0\u1efb\0\u1f62"+
|
||||||
"\0\u1efb\0\u1f62\0\u1fc9\0\u2030\0\u2097\0\u20fe\0\u2165\0\u21cc"+
|
"\0\u1fc9\0\u2030\0\u2097\0\u20fe\0\u2165\0\u21cc\0\u2233\0\u229a"+
|
||||||
"\0\u2233\0\u229a\0\u2301\0\u2368\0\u23cf\0\u2436\0\u249d\0\u2504"+
|
"\0\u2301\0\u2368\0\u23cf\0\u2436\0\u249d\0\u2504\0\u256b\0\u25d2"+
|
||||||
"\0\u256b\0\u25d2\0\u2639\0\u26a0\0\u2707\0\u276e\0\u27d5\0\u283c"+
|
"\0\u2639\0\u26a0\0\u2707\0\u276e\0\u27d5\0\u283c\0\u28a3\0\u290a"+
|
||||||
"\0\u28a3\0\u290a";
|
"\0\u2971\0\u29d8\0\u2a3f\0\u2aa6\0\u2b0d\0\u2b74\0\u2bdb\0\u2c42"+
|
||||||
|
"\0\u2ca9\0\u2d10\0\u2d77\0\u2dde";
|
||||||
|
|
||||||
private static int [] zzUnpackRowMap() {
|
private static int [] zzUnpackRowMap() {
|
||||||
int [] result = new int[114];
|
int [] result = new int[124];
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
|
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
|
||||||
return result;
|
return result;
|
||||||
|
@ -367,223 +368,241 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface {
|
||||||
"\1\57\3\0\1\75\11\0\1\46\2\0\1\76\16\0"+
|
"\1\57\3\0\1\75\11\0\1\46\2\0\1\76\16\0"+
|
||||||
"\1\77\2\0\1\100\21\0\1\101\17\0\1\25\1\102"+
|
"\1\77\2\0\1\100\21\0\1\101\17\0\1\25\1\102"+
|
||||||
"\1\26\1\103\3\0\1\102\1\0\1\102\2\0\1\25"+
|
"\1\26\1\103\3\0\1\102\1\0\1\102\2\0\1\25"+
|
||||||
"\142\0\2\31\4\0\1\35\1\0\1\36\1\0\1\37"+
|
"\142\0\2\31\16\0\1\104\15\0\1\105\14\0\1\106"+
|
||||||
"\1\0\1\40\1\0\1\41\1\0\1\104\3\0\1\43"+
|
"\16\0\1\107\2\0\1\110\42\0\1\32\7\0\1\32"+
|
||||||
"\5\0\1\44\3\0\1\105\11\0\1\46\2\0\1\106"+
|
"\16\0\1\111\15\0\1\112\14\0\1\113\16\0\1\114"+
|
||||||
"\16\0\1\107\2\0\1\110\41\0\1\25\1\34\1\52"+
|
"\2\0\1\115\42\0\1\33\7\0\1\33\4\0\1\35"+
|
||||||
"\1\0\1\53\1\0\1\53\1\54\1\0\1\34\2\0"+
|
|
||||||
"\1\34\2\0\1\25\11\0\3\25\5\0\1\25\1\0"+
|
|
||||||
"\1\25\1\0\1\25\4\0\1\25\4\0\1\25\1\0"+
|
|
||||||
"\2\25\4\0\1\25\5\0\1\25\3\0\1\25\4\0"+
|
|
||||||
"\5\25\10\0\1\52\1\0\2\25\1\0\1\25\10\0"+
|
|
||||||
"\1\25\24\0\1\25\1\0\1\52\7\0\2\25\2\0"+
|
|
||||||
"\5\25\2\0\2\25\4\0\6\25\1\0\2\25\4\0"+
|
|
||||||
"\5\25\1\0\5\25\1\0\2\25\1\0\3\25\1\0"+
|
|
||||||
"\4\25\1\0\5\25\1\52\1\0\1\25\1\0\1\25"+
|
|
||||||
"\1\0\3\25\2\0\1\25\1\0\1\25\1\0\1\25"+
|
|
||||||
"\2\0\1\25\17\0\1\25\3\0\1\25\5\0\2\25"+
|
|
||||||
"\3\0\1\25\4\0\3\25\4\0\1\25\1\0\1\25"+
|
|
||||||
"\2\0\1\25\1\0\2\25\4\0\1\25\1\0\1\25"+
|
|
||||||
"\3\0\2\25\1\0\1\25\5\0\3\25\1\0\1\25"+
|
|
||||||
"\10\0\1\25\1\0\2\52\1\0\1\25\10\0\1\25"+
|
|
||||||
"\24\0\1\25\3\0\1\25\6\0\2\25\5\0\1\25"+
|
|
||||||
"\1\0\1\25\1\0\1\25\1\0\11\25\2\0\1\25"+
|
|
||||||
"\4\0\1\25\4\0\6\25\2\0\1\25\1\0\1\25"+
|
|
||||||
"\1\0\3\25\1\0\1\25\1\0\2\25\4\0\3\25"+
|
|
||||||
"\1\0\1\25\10\0\1\25\1\0\2\25\21\0\1\25"+
|
|
||||||
"\3\0\1\25\5\0\1\25\32\0\15\25\5\0\3\25"+
|
|
||||||
"\1\0\1\25\5\0\3\25\5\0\1\25\2\0\2\25"+
|
|
||||||
"\4\0\1\25\2\0\1\25\1\0\1\25\103\0\2\25"+
|
|
||||||
"\6\0\1\25\56\0\1\25\3\0\1\25\2\0\1\25"+
|
|
||||||
"\3\0\1\25\5\0\1\25\7\0\1\25\4\0\2\25"+
|
|
||||||
"\3\0\2\25\1\0\1\25\4\0\1\25\1\0\1\25"+
|
|
||||||
"\2\0\2\25\1\0\3\25\1\0\1\25\2\0\4\25"+
|
|
||||||
"\2\0\1\25\41\0\1\35\1\0\1\36\1\0\1\37"+
|
|
||||||
"\1\0\1\40\1\0\1\41\1\0\1\111\3\0\1\43"+
|
|
||||||
"\5\0\1\44\3\0\1\112\11\0\1\46\2\0\1\113"+
|
|
||||||
"\16\0\1\114\2\0\1\115\41\0\1\25\2\52\2\0"+
|
|
||||||
"\2\116\1\54\1\0\1\52\2\0\1\25\1\0\1\35"+
|
|
||||||
"\1\0\1\36\1\0\1\37\1\0\1\40\1\0\1\41"+
|
"\1\0\1\36\1\0\1\37\1\0\1\40\1\0\1\41"+
|
||||||
"\1\0\1\117\3\0\1\120\5\0\1\121\3\0\1\122"+
|
"\1\0\1\116\3\0\1\43\5\0\1\44\3\0\1\117"+
|
||||||
"\11\0\1\46\2\0\1\123\16\0\1\124\2\0\1\125"+
|
"\11\0\1\46\2\0\1\120\16\0\1\121\2\0\1\122"+
|
||||||
"\41\0\1\25\1\53\7\0\1\53\2\0\1\25\1\0"+
|
"\41\0\1\25\1\34\1\52\1\0\1\53\1\0\1\53"+
|
||||||
"\1\35\1\0\1\36\1\0\1\37\1\0\1\40\1\0"+
|
"\1\54\1\0\1\34\2\0\1\34\2\0\1\25\11\0"+
|
||||||
"\1\41\1\0\1\126\3\0\1\43\5\0\1\44\3\0"+
|
"\3\25\5\0\1\25\1\0\1\25\1\0\1\25\4\0"+
|
||||||
"\1\127\11\0\1\46\2\0\1\130\16\0\1\131\2\0"+
|
"\1\25\4\0\1\25\1\0\2\25\4\0\1\25\5\0"+
|
||||||
"\1\132\21\0\1\101\17\0\1\25\1\54\1\52\1\103"+
|
"\1\25\3\0\1\25\4\0\5\25\10\0\1\52\1\0"+
|
||||||
"\3\0\1\54\1\0\1\54\2\0\1\25\2\0\1\26"+
|
"\2\25\1\0\1\25\10\0\1\25\24\0\1\25\1\0"+
|
||||||
"\11\0\3\25\5\0\1\25\1\0\1\25\1\0\1\25"+
|
"\1\52\7\0\2\25\2\0\5\25\2\0\2\25\4\0"+
|
||||||
"\4\0\1\25\4\0\1\26\1\0\2\26\4\0\1\25"+
|
"\6\25\1\0\2\25\4\0\5\25\1\0\5\25\1\0"+
|
||||||
"\5\0\1\25\3\0\1\26\4\0\1\26\2\25\2\26"+
|
"\2\25\1\0\3\25\1\0\4\25\1\0\5\25\1\52"+
|
||||||
"\10\0\1\26\1\0\2\25\1\0\1\26\10\0\1\25"+
|
"\1\0\1\25\1\0\1\25\1\0\3\25\2\0\1\25"+
|
||||||
"\24\0\1\25\3\0\1\25\6\0\2\25\5\0\1\25"+
|
"\1\0\1\25\1\0\1\25\2\0\1\25\17\0\1\25"+
|
||||||
"\1\0\1\25\1\0\1\25\1\0\11\25\2\0\1\25"+
|
"\3\0\1\25\5\0\2\25\3\0\1\25\4\0\3\25"+
|
||||||
"\4\0\1\25\4\0\6\25\2\0\1\25\1\0\1\25"+
|
"\4\0\1\25\1\0\1\25\2\0\1\25\1\0\2\25"+
|
||||||
"\1\0\3\25\1\0\1\26\1\0\2\25\4\0\3\25"+
|
"\4\0\1\25\1\0\1\25\3\0\2\25\1\0\1\25"+
|
||||||
"\1\0\1\25\10\0\1\25\1\0\2\25\21\0\1\25"+
|
"\5\0\3\25\1\0\1\25\10\0\1\25\1\0\2\52"+
|
||||||
"\3\0\1\25\5\0\1\25\32\0\15\25\5\0\3\25"+
|
"\1\0\1\25\10\0\1\25\24\0\1\25\3\0\1\25"+
|
||||||
"\1\0\1\25\5\0\1\25\2\26\5\0\1\25\2\0"+
|
|
||||||
"\1\25\1\26\4\0\1\25\2\0\1\25\1\0\1\25"+
|
|
||||||
"\103\0\2\26\6\0\1\26\56\0\1\26\3\0\1\26"+
|
|
||||||
"\2\0\1\26\3\0\1\26\5\0\1\26\7\0\1\26"+
|
|
||||||
"\4\0\2\26\3\0\2\26\1\0\1\26\4\0\1\26"+
|
|
||||||
"\1\0\1\26\2\0\2\26\1\0\3\26\1\0\1\26"+
|
|
||||||
"\2\0\4\26\2\0\1\26\53\0\1\133\3\0\1\134"+
|
|
||||||
"\5\0\1\135\3\0\1\136\14\0\1\137\16\0\1\140"+
|
|
||||||
"\2\0\1\141\42\0\1\64\1\26\6\0\1\64\4\0"+
|
|
||||||
"\1\35\1\0\1\36\1\0\1\37\1\0\1\40\1\0"+
|
|
||||||
"\1\41\1\0\1\142\3\0\1\56\5\0\1\57\3\0"+
|
|
||||||
"\1\143\11\0\1\46\2\0\1\144\16\0\1\145\2\0"+
|
|
||||||
"\1\146\21\0\1\101\17\0\1\25\1\65\1\26\1\103"+
|
|
||||||
"\3\0\1\65\1\0\1\65\2\0\1\25\2\0\1\27"+
|
|
||||||
"\37\0\1\27\1\0\2\27\16\0\1\27\4\0\1\27"+
|
|
||||||
"\2\0\2\27\15\0\1\27\132\0\1\27\153\0\2\27"+
|
|
||||||
"\11\0\1\27\115\0\2\27\6\0\1\27\56\0\1\27"+
|
|
||||||
"\3\0\1\27\2\0\1\27\3\0\1\27\5\0\1\27"+
|
|
||||||
"\7\0\1\27\4\0\2\27\3\0\2\27\1\0\1\27"+
|
|
||||||
"\4\0\1\27\1\0\1\27\2\0\2\27\1\0\3\27"+
|
|
||||||
"\1\0\1\27\2\0\4\27\2\0\1\27\153\0\1\27"+
|
|
||||||
"\35\0\1\102\11\0\3\25\5\0\1\25\1\0\1\25"+
|
|
||||||
"\1\0\1\25\4\0\1\25\4\0\1\102\1\0\2\102"+
|
|
||||||
"\4\0\1\25\5\0\1\25\3\0\1\102\4\0\1\102"+
|
|
||||||
"\2\25\2\102\10\0\1\26\1\0\2\25\1\0\1\102"+
|
|
||||||
"\10\0\1\25\24\0\1\25\3\0\1\25\6\0\2\25"+
|
|
||||||
"\5\0\1\25\1\0\1\25\1\0\1\25\1\0\11\25"+
|
|
||||||
"\2\0\1\25\4\0\1\25\4\0\6\25\2\0\1\25"+
|
|
||||||
"\1\0\1\25\1\0\3\25\1\0\1\102\1\0\2\25"+
|
|
||||||
"\4\0\3\25\1\0\1\25\10\0\1\25\1\0\2\25"+
|
|
||||||
"\21\0\1\25\3\0\1\25\5\0\1\25\32\0\15\25"+
|
|
||||||
"\5\0\3\25\1\0\1\25\5\0\1\25\2\102\5\0"+
|
|
||||||
"\1\25\2\0\1\25\1\102\4\0\1\25\2\0\1\25"+
|
|
||||||
"\1\0\1\25\103\0\2\102\6\0\1\102\56\0\1\102"+
|
|
||||||
"\3\0\1\102\2\0\1\102\3\0\1\102\5\0\1\102"+
|
|
||||||
"\7\0\1\102\4\0\2\102\3\0\2\102\1\0\1\102"+
|
|
||||||
"\4\0\1\102\1\0\1\102\2\0\2\102\1\0\3\102"+
|
|
||||||
"\1\0\1\102\2\0\4\102\2\0\1\102\153\0\1\103"+
|
|
||||||
"\46\0\1\147\15\0\1\150\14\0\1\151\16\0\1\152"+
|
|
||||||
"\2\0\1\153\21\0\1\101\20\0\1\103\1\0\1\103"+
|
|
||||||
"\3\0\1\54\1\0\1\103\5\0\1\34\11\0\3\25"+
|
|
||||||
"\5\0\1\25\1\0\1\25\1\0\1\25\4\0\1\25"+
|
|
||||||
"\4\0\1\34\1\0\2\34\4\0\1\25\5\0\1\25"+
|
|
||||||
"\3\0\1\34\4\0\1\34\2\25\2\34\10\0\1\52"+
|
|
||||||
"\1\0\2\25\1\0\1\34\10\0\1\25\24\0\1\25"+
|
|
||||||
"\3\0\1\25\6\0\2\25\5\0\1\25\1\0\1\25"+
|
|
||||||
"\1\0\1\25\1\0\11\25\2\0\1\25\4\0\1\25"+
|
|
||||||
"\4\0\6\25\2\0\1\25\1\0\1\25\1\0\3\25"+
|
|
||||||
"\1\0\1\34\1\0\2\25\4\0\3\25\1\0\1\25"+
|
|
||||||
"\10\0\1\25\1\0\2\25\21\0\1\25\3\0\1\25"+
|
|
||||||
"\5\0\1\25\32\0\15\25\5\0\3\25\1\0\1\25"+
|
|
||||||
"\5\0\1\25\2\34\5\0\1\25\2\0\1\25\1\34"+
|
|
||||||
"\4\0\1\25\2\0\1\25\1\0\1\25\103\0\2\34"+
|
|
||||||
"\6\0\1\34\56\0\1\34\3\0\1\34\2\0\1\34"+
|
|
||||||
"\3\0\1\34\5\0\1\34\7\0\1\34\4\0\2\34"+
|
|
||||||
"\3\0\2\34\1\0\1\34\4\0\1\34\1\0\1\34"+
|
|
||||||
"\2\0\2\34\1\0\3\34\1\0\1\34\2\0\4\34"+
|
|
||||||
"\2\0\1\34\42\0\1\52\11\0\3\25\5\0\1\25"+
|
|
||||||
"\1\0\1\25\1\0\1\25\4\0\1\25\4\0\1\52"+
|
|
||||||
"\1\0\2\52\4\0\1\25\5\0\1\25\3\0\1\52"+
|
|
||||||
"\4\0\1\52\2\25\2\52\10\0\1\52\1\0\2\25"+
|
|
||||||
"\1\0\1\52\10\0\1\25\24\0\1\25\3\0\1\25"+
|
|
||||||
"\6\0\2\25\5\0\1\25\1\0\1\25\1\0\1\25"+
|
"\6\0\2\25\5\0\1\25\1\0\1\25\1\0\1\25"+
|
||||||
"\1\0\11\25\2\0\1\25\4\0\1\25\4\0\6\25"+
|
"\1\0\11\25\2\0\1\25\4\0\1\25\4\0\6\25"+
|
||||||
"\2\0\1\25\1\0\1\25\1\0\3\25\1\0\1\52"+
|
"\2\0\1\25\1\0\1\25\1\0\3\25\1\0\1\25"+
|
||||||
|
"\1\0\2\25\4\0\3\25\1\0\1\25\10\0\1\25"+
|
||||||
|
"\1\0\2\25\21\0\1\25\3\0\1\25\5\0\1\25"+
|
||||||
|
"\32\0\15\25\5\0\3\25\1\0\1\25\5\0\3\25"+
|
||||||
|
"\5\0\1\25\2\0\2\25\4\0\1\25\2\0\1\25"+
|
||||||
|
"\1\0\1\25\103\0\2\25\6\0\1\25\56\0\1\25"+
|
||||||
|
"\3\0\1\25\2\0\1\25\3\0\1\25\5\0\1\25"+
|
||||||
|
"\7\0\1\25\4\0\2\25\3\0\2\25\1\0\1\25"+
|
||||||
|
"\4\0\1\25\1\0\1\25\2\0\2\25\1\0\3\25"+
|
||||||
|
"\1\0\1\25\2\0\4\25\2\0\1\25\41\0\1\35"+
|
||||||
|
"\1\0\1\36\1\0\1\37\1\0\1\40\1\0\1\41"+
|
||||||
|
"\1\0\1\123\3\0\1\43\5\0\1\44\3\0\1\124"+
|
||||||
|
"\11\0\1\46\2\0\1\125\16\0\1\126\2\0\1\127"+
|
||||||
|
"\41\0\1\25\2\52\2\0\2\130\1\54\1\0\1\52"+
|
||||||
|
"\2\0\1\25\1\0\1\35\1\0\1\36\1\0\1\37"+
|
||||||
|
"\1\0\1\40\1\0\1\41\1\0\1\131\3\0\1\132"+
|
||||||
|
"\5\0\1\133\3\0\1\134\11\0\1\46\2\0\1\135"+
|
||||||
|
"\16\0\1\136\2\0\1\137\41\0\1\25\1\53\7\0"+
|
||||||
|
"\1\53\2\0\1\25\1\0\1\35\1\0\1\36\1\0"+
|
||||||
|
"\1\37\1\0\1\40\1\0\1\41\1\0\1\140\3\0"+
|
||||||
|
"\1\43\5\0\1\44\3\0\1\141\11\0\1\46\2\0"+
|
||||||
|
"\1\142\16\0\1\143\2\0\1\144\21\0\1\101\17\0"+
|
||||||
|
"\1\25\1\54\1\52\1\103\3\0\1\54\1\0\1\54"+
|
||||||
|
"\2\0\1\25\2\0\1\26\11\0\3\25\5\0\1\25"+
|
||||||
|
"\1\0\1\25\1\0\1\25\4\0\1\25\4\0\1\26"+
|
||||||
|
"\1\0\2\26\4\0\1\25\5\0\1\25\3\0\1\26"+
|
||||||
|
"\4\0\1\26\2\25\2\26\10\0\1\26\1\0\2\25"+
|
||||||
|
"\1\0\1\26\10\0\1\25\24\0\1\25\3\0\1\25"+
|
||||||
|
"\6\0\2\25\5\0\1\25\1\0\1\25\1\0\1\25"+
|
||||||
|
"\1\0\11\25\2\0\1\25\4\0\1\25\4\0\6\25"+
|
||||||
|
"\2\0\1\25\1\0\1\25\1\0\3\25\1\0\1\26"+
|
||||||
"\1\0\2\25\4\0\3\25\1\0\1\25\10\0\1\25"+
|
"\1\0\2\25\4\0\3\25\1\0\1\25\10\0\1\25"+
|
||||||
"\1\0\2\25\21\0\1\25\3\0\1\25\5\0\1\25"+
|
"\1\0\2\25\21\0\1\25\3\0\1\25\5\0\1\25"+
|
||||||
"\32\0\15\25\5\0\3\25\1\0\1\25\5\0\1\25"+
|
"\32\0\15\25\5\0\3\25\1\0\1\25\5\0\1\25"+
|
||||||
"\2\52\5\0\1\25\2\0\1\25\1\52\4\0\1\25"+
|
"\2\26\5\0\1\25\2\0\1\25\1\26\4\0\1\25"+
|
||||||
"\2\0\1\25\1\0\1\25\103\0\2\52\6\0\1\52"+
|
"\2\0\1\25\1\0\1\25\103\0\2\26\6\0\1\26"+
|
||||||
"\56\0\1\52\3\0\1\52\2\0\1\52\3\0\1\52"+
|
"\56\0\1\26\3\0\1\26\2\0\1\26\3\0\1\26"+
|
||||||
"\5\0\1\52\7\0\1\52\4\0\2\52\3\0\2\52"+
|
"\5\0\1\26\7\0\1\26\4\0\2\26\3\0\2\26"+
|
||||||
"\1\0\1\52\4\0\1\52\1\0\1\52\2\0\2\52"+
|
"\1\0\1\26\4\0\1\26\1\0\1\26\2\0\2\26"+
|
||||||
"\1\0\3\52\1\0\1\52\2\0\4\52\2\0\1\52"+
|
"\1\0\3\26\1\0\1\26\2\0\4\26\2\0\1\26"+
|
||||||
"\53\0\1\154\3\0\1\155\5\0\1\156\3\0\1\157"+
|
"\53\0\1\145\3\0\1\146\5\0\1\147\3\0\1\150"+
|
||||||
"\14\0\1\160\16\0\1\161\2\0\1\162\42\0\1\116"+
|
"\14\0\1\151\16\0\1\152\2\0\1\153\42\0\1\64"+
|
||||||
"\1\52\6\0\1\116\5\0\1\53\11\0\3\25\5\0"+
|
"\1\26\6\0\1\64\4\0\1\35\1\0\1\36\1\0"+
|
||||||
"\1\25\1\0\1\25\1\0\1\25\4\0\1\25\4\0"+
|
"\1\37\1\0\1\40\1\0\1\41\1\0\1\154\3\0"+
|
||||||
"\1\53\1\0\2\53\4\0\1\25\5\0\1\25\3\0"+
|
"\1\56\5\0\1\57\3\0\1\155\11\0\1\46\2\0"+
|
||||||
"\1\53\4\0\1\53\2\25\2\53\12\0\2\25\1\0"+
|
"\1\156\16\0\1\157\2\0\1\160\21\0\1\101\17\0"+
|
||||||
"\1\53\10\0\1\25\24\0\1\25\11\0\2\25\2\0"+
|
"\1\25\1\65\1\26\1\103\3\0\1\65\1\0\1\65"+
|
||||||
"\5\25\2\0\2\25\4\0\6\25\1\0\2\25\4\0"+
|
"\2\0\1\25\2\0\1\27\37\0\1\27\1\0\2\27"+
|
||||||
"\5\25\1\0\5\25\1\0\2\25\1\0\3\25\1\0"+
|
"\16\0\1\27\4\0\1\27\2\0\2\27\15\0\1\27"+
|
||||||
"\4\25\1\0\5\25\2\0\1\25\1\0\1\25\1\0"+
|
"\132\0\1\27\153\0\2\27\11\0\1\27\115\0\2\27"+
|
||||||
"\3\25\2\0\1\25\1\0\1\25\1\0\1\25\2\0"+
|
"\6\0\1\27\56\0\1\27\3\0\1\27\2\0\1\27"+
|
||||||
"\1\25\17\0\1\25\3\0\1\25\5\0\2\25\3\0"+
|
"\3\0\1\27\5\0\1\27\7\0\1\27\4\0\2\27"+
|
||||||
"\1\25\4\0\3\25\4\0\1\25\1\0\1\25\2\0"+
|
"\3\0\2\27\1\0\1\27\4\0\1\27\1\0\1\27"+
|
||||||
"\1\25\1\0\2\25\4\0\1\25\1\0\1\25\3\0"+
|
"\2\0\2\27\1\0\3\27\1\0\1\27\2\0\4\27"+
|
||||||
"\2\25\1\0\1\25\5\0\3\25\1\0\1\25\10\0"+
|
"\2\0\1\27\153\0\1\27\35\0\1\102\11\0\3\25"+
|
||||||
"\1\25\4\0\1\25\10\0\1\25\24\0\1\25\3\0"+
|
"\5\0\1\25\1\0\1\25\1\0\1\25\4\0\1\25"+
|
||||||
"\1\25\6\0\2\25\5\0\1\25\1\0\1\25\1\0"+
|
"\4\0\1\102\1\0\2\102\4\0\1\25\5\0\1\25"+
|
||||||
"\1\25\1\0\11\25\2\0\1\25\4\0\1\25\4\0"+
|
"\3\0\1\102\4\0\1\102\2\25\2\102\10\0\1\26"+
|
||||||
"\6\25\2\0\1\25\1\0\1\25\1\0\3\25\1\0"+
|
"\1\0\2\25\1\0\1\102\10\0\1\25\24\0\1\25"+
|
||||||
"\1\53\1\0\2\25\4\0\3\25\1\0\1\25\10\0"+
|
"\3\0\1\25\6\0\2\25\5\0\1\25\1\0\1\25"+
|
||||||
"\1\25\1\0\2\25\21\0\1\25\3\0\1\25\5\0"+
|
"\1\0\1\25\1\0\11\25\2\0\1\25\4\0\1\25"+
|
||||||
"\1\25\32\0\15\25\5\0\3\25\1\0\1\25\5\0"+
|
"\4\0\6\25\2\0\1\25\1\0\1\25\1\0\3\25"+
|
||||||
"\1\25\2\53\5\0\1\25\2\0\1\25\1\53\4\0"+
|
"\1\0\1\102\1\0\2\25\4\0\3\25\1\0\1\25"+
|
||||||
"\1\25\2\0\1\25\1\0\1\25\103\0\2\53\6\0"+
|
"\10\0\1\25\1\0\2\25\21\0\1\25\3\0\1\25"+
|
||||||
"\1\53\56\0\1\53\3\0\1\53\2\0\1\53\3\0"+
|
"\5\0\1\25\32\0\15\25\5\0\3\25\1\0\1\25"+
|
||||||
"\1\53\5\0\1\53\7\0\1\53\4\0\2\53\3\0"+
|
"\5\0\1\25\2\102\5\0\1\25\2\0\1\25\1\102"+
|
||||||
"\2\53\1\0\1\53\4\0\1\53\1\0\1\53\2\0"+
|
"\4\0\1\25\2\0\1\25\1\0\1\25\103\0\2\102"+
|
||||||
"\2\53\1\0\3\53\1\0\1\53\2\0\4\53\2\0"+
|
"\6\0\1\102\56\0\1\102\3\0\1\102\2\0\1\102"+
|
||||||
"\1\53\42\0\1\54\11\0\3\25\5\0\1\25\1\0"+
|
"\3\0\1\102\5\0\1\102\7\0\1\102\4\0\2\102"+
|
||||||
"\1\25\1\0\1\25\4\0\1\25\4\0\1\54\1\0"+
|
"\3\0\2\102\1\0\1\102\4\0\1\102\1\0\1\102"+
|
||||||
"\2\54\4\0\1\25\5\0\1\25\3\0\1\54\4\0"+
|
"\2\0\2\102\1\0\3\102\1\0\1\102\2\0\4\102"+
|
||||||
"\1\54\2\25\2\54\10\0\1\52\1\0\2\25\1\0"+
|
"\2\0\1\102\153\0\1\103\46\0\1\161\15\0\1\162"+
|
||||||
"\1\54\10\0\1\25\24\0\1\25\3\0\1\25\6\0"+
|
"\14\0\1\163\16\0\1\164\2\0\1\165\21\0\1\101"+
|
||||||
"\2\25\5\0\1\25\1\0\1\25\1\0\1\25\1\0"+
|
"\20\0\1\103\1\0\1\103\3\0\1\54\1\0\1\103"+
|
||||||
"\11\25\2\0\1\25\4\0\1\25\4\0\6\25\2\0"+
|
"\5\0\1\32\37\0\1\32\1\0\2\32\16\0\1\32"+
|
||||||
"\1\25\1\0\1\25\1\0\3\25\1\0\1\54\1\0"+
|
"\4\0\1\32\2\0\2\32\15\0\1\32\132\0\1\32"+
|
||||||
"\2\25\4\0\3\25\1\0\1\25\10\0\1\25\1\0"+
|
"\153\0\2\32\11\0\1\32\115\0\2\32\6\0\1\32"+
|
||||||
"\2\25\21\0\1\25\3\0\1\25\5\0\1\25\32\0"+
|
"\56\0\1\32\3\0\1\32\2\0\1\32\3\0\1\32"+
|
||||||
"\15\25\5\0\3\25\1\0\1\25\5\0\1\25\2\54"+
|
"\5\0\1\32\7\0\1\32\4\0\2\32\3\0\2\32"+
|
||||||
"\5\0\1\25\2\0\1\25\1\54\4\0\1\25\2\0"+
|
"\1\0\1\32\4\0\1\32\1\0\1\32\2\0\2\32"+
|
||||||
"\1\25\1\0\1\25\103\0\2\54\6\0\1\54\56\0"+
|
"\1\0\3\32\1\0\1\32\2\0\4\32\2\0\1\32"+
|
||||||
"\1\54\3\0\1\54\2\0\1\54\3\0\1\54\5\0"+
|
"\42\0\1\33\37\0\1\33\1\0\2\33\16\0\1\33"+
|
||||||
"\1\54\7\0\1\54\4\0\2\54\3\0\2\54\1\0"+
|
"\4\0\1\33\2\0\2\33\15\0\1\33\132\0\1\33"+
|
||||||
"\1\54\4\0\1\54\1\0\1\54\2\0\2\54\1\0"+
|
"\153\0\2\33\11\0\1\33\115\0\2\33\6\0\1\33"+
|
||||||
"\3\54\1\0\1\54\2\0\4\54\2\0\1\54\42\0"+
|
"\56\0\1\33\3\0\1\33\2\0\1\33\3\0\1\33"+
|
||||||
"\1\64\37\0\1\64\1\0\2\64\16\0\1\64\4\0"+
|
"\5\0\1\33\7\0\1\33\4\0\2\33\3\0\2\33"+
|
||||||
"\1\64\2\0\2\64\10\0\1\26\4\0\1\64\37\0"+
|
"\1\0\1\33\4\0\1\33\1\0\1\33\2\0\2\33"+
|
||||||
"\1\26\102\0\1\26\147\0\2\26\134\0\1\64\153\0"+
|
"\1\0\3\33\1\0\1\33\2\0\4\33\2\0\1\33"+
|
||||||
"\2\64\11\0\1\64\115\0\2\64\6\0\1\64\56\0"+
|
"\42\0\1\34\11\0\3\25\5\0\1\25\1\0\1\25"+
|
||||||
"\1\64\3\0\1\64\2\0\1\64\3\0\1\64\5\0"+
|
"\1\0\1\25\4\0\1\25\4\0\1\34\1\0\2\34"+
|
||||||
"\1\64\7\0\1\64\4\0\2\64\3\0\2\64\1\0"+
|
"\4\0\1\25\5\0\1\25\3\0\1\34\4\0\1\34"+
|
||||||
"\1\64\4\0\1\64\1\0\1\64\2\0\2\64\1\0"+
|
"\2\25\2\34\10\0\1\52\1\0\2\25\1\0\1\34"+
|
||||||
"\3\64\1\0\1\64\2\0\4\64\2\0\1\64\42\0"+
|
"\10\0\1\25\24\0\1\25\3\0\1\25\6\0\2\25"+
|
||||||
"\1\65\11\0\3\25\5\0\1\25\1\0\1\25\1\0"+
|
"\5\0\1\25\1\0\1\25\1\0\1\25\1\0\11\25"+
|
||||||
"\1\25\4\0\1\25\4\0\1\65\1\0\2\65\4\0"+
|
"\2\0\1\25\4\0\1\25\4\0\6\25\2\0\1\25"+
|
||||||
"\1\25\5\0\1\25\3\0\1\65\4\0\1\65\2\25"+
|
"\1\0\1\25\1\0\3\25\1\0\1\34\1\0\2\25"+
|
||||||
"\2\65\10\0\1\26\1\0\2\25\1\0\1\65\10\0"+
|
"\4\0\3\25\1\0\1\25\10\0\1\25\1\0\2\25"+
|
||||||
|
"\21\0\1\25\3\0\1\25\5\0\1\25\32\0\15\25"+
|
||||||
|
"\5\0\3\25\1\0\1\25\5\0\1\25\2\34\5\0"+
|
||||||
|
"\1\25\2\0\1\25\1\34\4\0\1\25\2\0\1\25"+
|
||||||
|
"\1\0\1\25\103\0\2\34\6\0\1\34\56\0\1\34"+
|
||||||
|
"\3\0\1\34\2\0\1\34\3\0\1\34\5\0\1\34"+
|
||||||
|
"\7\0\1\34\4\0\2\34\3\0\2\34\1\0\1\34"+
|
||||||
|
"\4\0\1\34\1\0\1\34\2\0\2\34\1\0\3\34"+
|
||||||
|
"\1\0\1\34\2\0\4\34\2\0\1\34\42\0\1\52"+
|
||||||
|
"\11\0\3\25\5\0\1\25\1\0\1\25\1\0\1\25"+
|
||||||
|
"\4\0\1\25\4\0\1\52\1\0\2\52\4\0\1\25"+
|
||||||
|
"\5\0\1\25\3\0\1\52\4\0\1\52\2\25\2\52"+
|
||||||
|
"\10\0\1\52\1\0\2\25\1\0\1\52\10\0\1\25"+
|
||||||
|
"\24\0\1\25\3\0\1\25\6\0\2\25\5\0\1\25"+
|
||||||
|
"\1\0\1\25\1\0\1\25\1\0\11\25\2\0\1\25"+
|
||||||
|
"\4\0\1\25\4\0\6\25\2\0\1\25\1\0\1\25"+
|
||||||
|
"\1\0\3\25\1\0\1\52\1\0\2\25\4\0\3\25"+
|
||||||
|
"\1\0\1\25\10\0\1\25\1\0\2\25\21\0\1\25"+
|
||||||
|
"\3\0\1\25\5\0\1\25\32\0\15\25\5\0\3\25"+
|
||||||
|
"\1\0\1\25\5\0\1\25\2\52\5\0\1\25\2\0"+
|
||||||
|
"\1\25\1\52\4\0\1\25\2\0\1\25\1\0\1\25"+
|
||||||
|
"\103\0\2\52\6\0\1\52\56\0\1\52\3\0\1\52"+
|
||||||
|
"\2\0\1\52\3\0\1\52\5\0\1\52\7\0\1\52"+
|
||||||
|
"\4\0\2\52\3\0\2\52\1\0\1\52\4\0\1\52"+
|
||||||
|
"\1\0\1\52\2\0\2\52\1\0\3\52\1\0\1\52"+
|
||||||
|
"\2\0\4\52\2\0\1\52\53\0\1\166\3\0\1\167"+
|
||||||
|
"\5\0\1\170\3\0\1\171\14\0\1\172\16\0\1\173"+
|
||||||
|
"\2\0\1\174\42\0\1\130\1\52\6\0\1\130\5\0"+
|
||||||
|
"\1\53\11\0\3\25\5\0\1\25\1\0\1\25\1\0"+
|
||||||
|
"\1\25\4\0\1\25\4\0\1\53\1\0\2\53\4\0"+
|
||||||
|
"\1\25\5\0\1\25\3\0\1\53\4\0\1\53\2\25"+
|
||||||
|
"\2\53\12\0\2\25\1\0\1\53\10\0\1\25\24\0"+
|
||||||
|
"\1\25\11\0\2\25\2\0\5\25\2\0\2\25\4\0"+
|
||||||
|
"\6\25\1\0\2\25\4\0\5\25\1\0\5\25\1\0"+
|
||||||
|
"\2\25\1\0\3\25\1\0\4\25\1\0\5\25\2\0"+
|
||||||
|
"\1\25\1\0\1\25\1\0\3\25\2\0\1\25\1\0"+
|
||||||
|
"\1\25\1\0\1\25\2\0\1\25\17\0\1\25\3\0"+
|
||||||
|
"\1\25\5\0\2\25\3\0\1\25\4\0\3\25\4\0"+
|
||||||
|
"\1\25\1\0\1\25\2\0\1\25\1\0\2\25\4\0"+
|
||||||
|
"\1\25\1\0\1\25\3\0\2\25\1\0\1\25\5\0"+
|
||||||
|
"\3\25\1\0\1\25\10\0\1\25\4\0\1\25\10\0"+
|
||||||
"\1\25\24\0\1\25\3\0\1\25\6\0\2\25\5\0"+
|
"\1\25\24\0\1\25\3\0\1\25\6\0\2\25\5\0"+
|
||||||
"\1\25\1\0\1\25\1\0\1\25\1\0\11\25\2\0"+
|
"\1\25\1\0\1\25\1\0\1\25\1\0\11\25\2\0"+
|
||||||
"\1\25\4\0\1\25\4\0\6\25\2\0\1\25\1\0"+
|
"\1\25\4\0\1\25\4\0\6\25\2\0\1\25\1\0"+
|
||||||
"\1\25\1\0\3\25\1\0\1\65\1\0\2\25\4\0"+
|
"\1\25\1\0\3\25\1\0\1\53\1\0\2\25\4\0"+
|
||||||
"\3\25\1\0\1\25\10\0\1\25\1\0\2\25\21\0"+
|
"\3\25\1\0\1\25\10\0\1\25\1\0\2\25\21\0"+
|
||||||
"\1\25\3\0\1\25\5\0\1\25\32\0\15\25\5\0"+
|
"\1\25\3\0\1\25\5\0\1\25\32\0\15\25\5\0"+
|
||||||
"\3\25\1\0\1\25\5\0\1\25\2\65\5\0\1\25"+
|
"\3\25\1\0\1\25\5\0\1\25\2\53\5\0\1\25"+
|
||||||
"\2\0\1\25\1\65\4\0\1\25\2\0\1\25\1\0"+
|
"\2\0\1\25\1\53\4\0\1\25\2\0\1\25\1\0"+
|
||||||
"\1\25\103\0\2\65\6\0\1\65\56\0\1\65\3\0"+
|
"\1\25\103\0\2\53\6\0\1\53\56\0\1\53\3\0"+
|
||||||
"\1\65\2\0\1\65\3\0\1\65\5\0\1\65\7\0"+
|
"\1\53\2\0\1\53\3\0\1\53\5\0\1\53\7\0"+
|
||||||
"\1\65\4\0\2\65\3\0\2\65\1\0\1\65\4\0"+
|
"\1\53\4\0\2\53\3\0\2\53\1\0\1\53\4\0"+
|
||||||
"\1\65\1\0\1\65\2\0\2\65\1\0\3\65\1\0"+
|
"\1\53\1\0\1\53\2\0\2\53\1\0\3\53\1\0"+
|
||||||
"\1\65\2\0\4\65\2\0\1\65\42\0\1\103\37\0"+
|
"\1\53\2\0\4\53\2\0\1\53\42\0\1\54\11\0"+
|
||||||
"\1\103\1\0\2\103\16\0\1\103\4\0\1\103\2\0"+
|
"\3\25\5\0\1\25\1\0\1\25\1\0\1\25\4\0"+
|
||||||
"\2\103\15\0\1\103\132\0\1\103\153\0\2\103\11\0"+
|
"\1\25\4\0\1\54\1\0\2\54\4\0\1\25\5\0"+
|
||||||
"\1\103\115\0\2\103\6\0\1\103\56\0\1\103\3\0"+
|
"\1\25\3\0\1\54\4\0\1\54\2\25\2\54\10\0"+
|
||||||
"\1\103\2\0\1\103\3\0\1\103\5\0\1\103\7\0"+
|
"\1\52\1\0\2\25\1\0\1\54\10\0\1\25\24\0"+
|
||||||
"\1\103\4\0\2\103\3\0\2\103\1\0\1\103\4\0"+
|
"\1\25\3\0\1\25\6\0\2\25\5\0\1\25\1\0"+
|
||||||
"\1\103\1\0\1\103\2\0\2\103\1\0\3\103\1\0"+
|
"\1\25\1\0\1\25\1\0\11\25\2\0\1\25\4\0"+
|
||||||
"\1\103\2\0\4\103\2\0\1\103\42\0\1\116\37\0"+
|
"\1\25\4\0\6\25\2\0\1\25\1\0\1\25\1\0"+
|
||||||
"\1\116\1\0\2\116\16\0\1\116\4\0\1\116\2\0"+
|
"\3\25\1\0\1\54\1\0\2\25\4\0\3\25\1\0"+
|
||||||
"\2\116\10\0\1\52\4\0\1\116\37\0\1\52\102\0"+
|
"\1\25\10\0\1\25\1\0\2\25\21\0\1\25\3\0"+
|
||||||
"\1\52\147\0\2\52\134\0\1\116\153\0\2\116\11\0"+
|
"\1\25\5\0\1\25\32\0\15\25\5\0\3\25\1\0"+
|
||||||
"\1\116\115\0\2\116\6\0\1\116\56\0\1\116\3\0"+
|
"\1\25\5\0\1\25\2\54\5\0\1\25\2\0\1\25"+
|
||||||
"\1\116\2\0\1\116\3\0\1\116\5\0\1\116\7\0"+
|
"\1\54\4\0\1\25\2\0\1\25\1\0\1\25\103\0"+
|
||||||
"\1\116\4\0\2\116\3\0\2\116\1\0\1\116\4\0"+
|
"\2\54\6\0\1\54\56\0\1\54\3\0\1\54\2\0"+
|
||||||
"\1\116\1\0\1\116\2\0\2\116\1\0\3\116\1\0"+
|
"\1\54\3\0\1\54\5\0\1\54\7\0\1\54\4\0"+
|
||||||
"\1\116\2\0\4\116\2\0\1\116\40\0";
|
"\2\54\3\0\2\54\1\0\1\54\4\0\1\54\1\0"+
|
||||||
|
"\1\54\2\0\2\54\1\0\3\54\1\0\1\54\2\0"+
|
||||||
|
"\4\54\2\0\1\54\42\0\1\64\37\0\1\64\1\0"+
|
||||||
|
"\2\64\16\0\1\64\4\0\1\64\2\0\2\64\10\0"+
|
||||||
|
"\1\26\4\0\1\64\37\0\1\26\102\0\1\26\147\0"+
|
||||||
|
"\2\26\134\0\1\64\153\0\2\64\11\0\1\64\115\0"+
|
||||||
|
"\2\64\6\0\1\64\56\0\1\64\3\0\1\64\2\0"+
|
||||||
|
"\1\64\3\0\1\64\5\0\1\64\7\0\1\64\4\0"+
|
||||||
|
"\2\64\3\0\2\64\1\0\1\64\4\0\1\64\1\0"+
|
||||||
|
"\1\64\2\0\2\64\1\0\3\64\1\0\1\64\2\0"+
|
||||||
|
"\4\64\2\0\1\64\42\0\1\65\11\0\3\25\5\0"+
|
||||||
|
"\1\25\1\0\1\25\1\0\1\25\4\0\1\25\4\0"+
|
||||||
|
"\1\65\1\0\2\65\4\0\1\25\5\0\1\25\3\0"+
|
||||||
|
"\1\65\4\0\1\65\2\25\2\65\10\0\1\26\1\0"+
|
||||||
|
"\2\25\1\0\1\65\10\0\1\25\24\0\1\25\3\0"+
|
||||||
|
"\1\25\6\0\2\25\5\0\1\25\1\0\1\25\1\0"+
|
||||||
|
"\1\25\1\0\11\25\2\0\1\25\4\0\1\25\4\0"+
|
||||||
|
"\6\25\2\0\1\25\1\0\1\25\1\0\3\25\1\0"+
|
||||||
|
"\1\65\1\0\2\25\4\0\3\25\1\0\1\25\10\0"+
|
||||||
|
"\1\25\1\0\2\25\21\0\1\25\3\0\1\25\5\0"+
|
||||||
|
"\1\25\32\0\15\25\5\0\3\25\1\0\1\25\5\0"+
|
||||||
|
"\1\25\2\65\5\0\1\25\2\0\1\25\1\65\4\0"+
|
||||||
|
"\1\25\2\0\1\25\1\0\1\25\103\0\2\65\6\0"+
|
||||||
|
"\1\65\56\0\1\65\3\0\1\65\2\0\1\65\3\0"+
|
||||||
|
"\1\65\5\0\1\65\7\0\1\65\4\0\2\65\3\0"+
|
||||||
|
"\2\65\1\0\1\65\4\0\1\65\1\0\1\65\2\0"+
|
||||||
|
"\2\65\1\0\3\65\1\0\1\65\2\0\4\65\2\0"+
|
||||||
|
"\1\65\42\0\1\103\37\0\1\103\1\0\2\103\16\0"+
|
||||||
|
"\1\103\4\0\1\103\2\0\2\103\15\0\1\103\132\0"+
|
||||||
|
"\1\103\153\0\2\103\11\0\1\103\115\0\2\103\6\0"+
|
||||||
|
"\1\103\56\0\1\103\3\0\1\103\2\0\1\103\3\0"+
|
||||||
|
"\1\103\5\0\1\103\7\0\1\103\4\0\2\103\3\0"+
|
||||||
|
"\2\103\1\0\1\103\4\0\1\103\1\0\1\103\2\0"+
|
||||||
|
"\2\103\1\0\3\103\1\0\1\103\2\0\4\103\2\0"+
|
||||||
|
"\1\103\42\0\1\130\37\0\1\130\1\0\2\130\16\0"+
|
||||||
|
"\1\130\4\0\1\130\2\0\2\130\10\0\1\52\4\0"+
|
||||||
|
"\1\130\37\0\1\52\102\0\1\52\147\0\2\52\134\0"+
|
||||||
|
"\1\130\153\0\2\130\11\0\1\130\115\0\2\130\6\0"+
|
||||||
|
"\1\130\56\0\1\130\3\0\1\130\2\0\1\130\3\0"+
|
||||||
|
"\1\130\5\0\1\130\7\0\1\130\4\0\2\130\3\0"+
|
||||||
|
"\2\130\1\0\1\130\4\0\1\130\1\0\1\130\2\0"+
|
||||||
|
"\2\130\1\0\3\130\1\0\1\130\2\0\4\130\2\0"+
|
||||||
|
"\1\130\40\0";
|
||||||
|
|
||||||
private static int [] zzUnpackTrans() {
|
private static int [] zzUnpackTrans() {
|
||||||
int [] result = new int[10609];
|
int [] result = new int[11845];
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
|
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
|
||||||
return result;
|
return result;
|
||||||
|
@ -621,11 +640,11 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface {
|
||||||
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
|
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
|
||||||
|
|
||||||
private static final String ZZ_ATTRIBUTE_PACKED_0 =
|
private static final String ZZ_ATTRIBUTE_PACKED_0 =
|
||||||
"\1\0\1\11\27\1\2\11\1\1\15\0\1\1\1\0"+
|
"\1\0\1\11\32\1\15\0\1\1\1\0\1\1\10\0"+
|
||||||
"\1\1\10\0\1\1\15\0\1\1\57\0";
|
"\1\1\15\0\1\1\71\0";
|
||||||
|
|
||||||
private static int [] zzUnpackAttribute() {
|
private static int [] zzUnpackAttribute() {
|
||||||
int [] result = new int[114];
|
int [] result = new int[124];
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
|
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
|
||||||
return result;
|
return result;
|
||||||
|
|
|
@ -71,6 +71,8 @@ MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
|
||||||
MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
|
MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
|
||||||
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
|
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
|
||||||
|
|
||||||
|
HanEx = {Han} ({Format} | {Extend})*
|
||||||
|
HiraganaEx = {Hiragana} ({Format} | {Extend})*
|
||||||
|
|
||||||
%{
|
%{
|
||||||
/** Alphanumeric sequences */
|
/** Alphanumeric sequences */
|
||||||
|
@ -178,8 +180,8 @@ ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
|
||||||
|
|
||||||
// UAX#29 WB14. Any ÷ Any
|
// UAX#29 WB14. Any ÷ Any
|
||||||
//
|
//
|
||||||
{Han} { return IDEOGRAPHIC_TYPE; }
|
{HanEx} { return IDEOGRAPHIC_TYPE; }
|
||||||
{Hiragana} { return HIRAGANA_TYPE; }
|
{HiraganaEx} { return HIRAGANA_TYPE; }
|
||||||
|
|
||||||
|
|
||||||
// UAX#29 WB3. CR × LF
|
// UAX#29 WB3. CR × LF
|
||||||
|
|
|
@ -22,7 +22,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
interface StandardTokenizerInterface {
|
/** @lucene.internal */
|
||||||
|
public interface StandardTokenizerInterface {
|
||||||
|
|
||||||
/** This character denotes the end of file */
|
/** This character denotes the end of file */
|
||||||
public static final int YYEOF = -1;
|
public static final int YYEOF = -1;
|
||||||
|
@ -30,12 +31,12 @@ interface StandardTokenizerInterface {
|
||||||
/**
|
/**
|
||||||
* Copies the matched text into the CharTermAttribute
|
* Copies the matched text into the CharTermAttribute
|
||||||
*/
|
*/
|
||||||
void getText(CharTermAttribute t);
|
public void getText(CharTermAttribute t);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the current position.
|
* Returns the current position.
|
||||||
*/
|
*/
|
||||||
int yychar();
|
public int yychar();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Resets the scanner to read from a new input stream.
|
* Resets the scanner to read from a new input stream.
|
||||||
|
@ -47,12 +48,12 @@ interface StandardTokenizerInterface {
|
||||||
*
|
*
|
||||||
* @param reader the new input stream
|
* @param reader the new input stream
|
||||||
*/
|
*/
|
||||||
void yyreset(Reader reader);
|
public void yyreset(Reader reader);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the length of the matched text region.
|
* Returns the length of the matched text region.
|
||||||
*/
|
*/
|
||||||
int yylength();
|
public int yylength();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Resumes scanning until the next regular expression is matched,
|
* Resumes scanning until the next regular expression is matched,
|
||||||
|
@ -61,6 +62,6 @@ interface StandardTokenizerInterface {
|
||||||
* @return the next token, {@link #YYEOF} on end of stream
|
* @return the next token, {@link #YYEOF} on end of stream
|
||||||
* @exception IOException if any I/O-Error occurs
|
* @exception IOException if any I/O-Error occurs
|
||||||
*/
|
*/
|
||||||
int getNextToken() throws IOException;
|
public int getNextToken() throws IOException;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,184 @@
|
||||||
|
package org.apache.lucene.analysis.standard.std31;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizerInterface;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class implements StandardTokenizer, except with a bug
|
||||||
|
* (https://issues.apache.org/jira/browse/LUCENE-3358) where Han and Hiragana
|
||||||
|
* characters would be split from combining characters.
|
||||||
|
* @deprecated This class is only for exact backwards compatibility
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
%%
|
||||||
|
|
||||||
|
%unicode 6.0
|
||||||
|
%integer
|
||||||
|
%final
|
||||||
|
%public
|
||||||
|
%class StandardTokenizerImpl31
|
||||||
|
%implements StandardTokenizerInterface
|
||||||
|
%function getNextToken
|
||||||
|
%char
|
||||||
|
|
||||||
|
%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
|
||||||
|
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
|
||||||
|
Format = ([\p{WB:Format}] | {FormatSupp})
|
||||||
|
Numeric = ([\p{WB:Numeric}] | {NumericSupp})
|
||||||
|
Extend = ([\p{WB:Extend}] | {ExtendSupp})
|
||||||
|
Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
|
||||||
|
MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
|
||||||
|
MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
|
||||||
|
MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
|
||||||
|
ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
|
||||||
|
ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
|
||||||
|
Han = ([\p{Script:Han}] | {HanSupp})
|
||||||
|
Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
|
||||||
|
|
||||||
|
// Script=Hangul & Aletter
|
||||||
|
HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
|
||||||
|
// UAX#29 WB4. X (Extend | Format)* --> X
|
||||||
|
//
|
||||||
|
ALetterEx = {ALetter} ({Format} | {Extend})*
|
||||||
|
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
|
||||||
|
NumericEx = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
|
||||||
|
KatakanaEx = {Katakana} ({Format} | {Extend})*
|
||||||
|
MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
|
||||||
|
MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
|
||||||
|
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
|
||||||
|
|
||||||
|
|
||||||
|
%{
|
||||||
|
/** Alphanumeric sequences */
|
||||||
|
public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;
|
||||||
|
|
||||||
|
/** Numbers */
|
||||||
|
public static final int NUMERIC_TYPE = StandardTokenizer.NUM;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
|
||||||
|
* scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
|
||||||
|
* together as a single token rather than broken up, because the logic
|
||||||
|
* required to break them at word boundaries is too complex for UAX#29.
|
||||||
|
* <p>
|
||||||
|
* See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
|
||||||
|
*/
|
||||||
|
public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;
|
||||||
|
|
||||||
|
public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
|
||||||
|
|
||||||
|
public static final int HIRAGANA_TYPE = StandardTokenizer.HIRAGANA;
|
||||||
|
|
||||||
|
public static final int KATAKANA_TYPE = StandardTokenizer.KATAKANA;
|
||||||
|
|
||||||
|
public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
|
||||||
|
|
||||||
|
public final int yychar()
|
||||||
|
{
|
||||||
|
return yychar;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fills CharTermAttribute with the current token text.
|
||||||
|
*/
|
||||||
|
public final void getText(CharTermAttribute t) {
|
||||||
|
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
||||||
|
}
|
||||||
|
%}
|
||||||
|
|
||||||
|
%%
|
||||||
|
|
||||||
|
// UAX#29 WB1. sot ÷
|
||||||
|
// WB2. ÷ eot
|
||||||
|
//
|
||||||
|
<<EOF>> { return StandardTokenizerInterface.YYEOF; }
|
||||||
|
|
||||||
|
// UAX#29 WB8. Numeric × Numeric
|
||||||
|
// WB11. Numeric (MidNum | MidNumLet) × Numeric
|
||||||
|
// WB12. Numeric × (MidNum | MidNumLet) Numeric
|
||||||
|
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
||||||
|
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
|
||||||
|
//
|
||||||
|
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
|
||||||
|
| {MidNumericEx} {NumericEx}
|
||||||
|
| {NumericEx})*
|
||||||
|
{ExtendNumLetEx}*
|
||||||
|
{ return NUMERIC_TYPE; }
|
||||||
|
|
||||||
|
// subset of the below for typing purposes only!
|
||||||
|
{HangulEx}+
|
||||||
|
{ return HANGUL_TYPE; }
|
||||||
|
|
||||||
|
{KatakanaEx}+
|
||||||
|
{ return KATAKANA_TYPE; }
|
||||||
|
|
||||||
|
// UAX#29 WB5. ALetter × ALetter
|
||||||
|
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
|
||||||
|
// WB7. ALetter (MidLetter | MidNumLet) × ALetter
|
||||||
|
// WB9. ALetter × Numeric
|
||||||
|
// WB10. Numeric × ALetter
|
||||||
|
// WB13. Katakana × Katakana
|
||||||
|
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
||||||
|
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
|
||||||
|
//
|
||||||
|
{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
|
||||||
|
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
|
||||||
|
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ )
|
||||||
|
({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
|
||||||
|
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
|
||||||
|
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
|
||||||
|
{ExtendNumLetEx}*
|
||||||
|
{ return WORD_TYPE; }
|
||||||
|
|
||||||
|
|
||||||
|
// From UAX #29:
|
||||||
|
//
|
||||||
|
// [C]haracters with the Line_Break property values of Contingent_Break (CB),
|
||||||
|
// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
|
||||||
|
// boundary property values based on criteria outside of the scope of this
|
||||||
|
// annex. That means that satisfactory treatment of languages like Chinese
|
||||||
|
// or Thai requires special handling.
|
||||||
|
//
|
||||||
|
// In Unicode 6.0, only one character has the \p{Line_Break = Contingent_Break}
|
||||||
|
// property: U+FFFC (  ) OBJECT REPLACEMENT CHARACTER.
|
||||||
|
//
|
||||||
|
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
|
||||||
|
// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
|
||||||
|
// Lao, etc.) are kept together. This grammar does the same below.
|
||||||
|
//
|
||||||
|
// See also the Unicode Line Breaking Algorithm:
|
||||||
|
//
|
||||||
|
// http://www.unicode.org/reports/tr14/#SA
|
||||||
|
//
|
||||||
|
{ComplexContext}+ { return SOUTH_EAST_ASIAN_TYPE; }
|
||||||
|
|
||||||
|
// UAX#29 WB14. Any ÷ Any
|
||||||
|
//
|
||||||
|
{Han} { return IDEOGRAPHIC_TYPE; }
|
||||||
|
{Hiragana} { return HIRAGANA_TYPE; }
|
||||||
|
|
||||||
|
|
||||||
|
// UAX#29 WB3. CR × LF
|
||||||
|
// WB3a. (Newline | CR | LF) ÷
|
||||||
|
// WB3b. ÷ (Newline | CR | LF)
|
||||||
|
// WB14. Any ÷ Any
|
||||||
|
//
|
||||||
|
[^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
|
|
@ -6,6 +6,7 @@ import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
@ -221,6 +222,23 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
|
||||||
new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
|
new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testCombiningMarks() throws Exception {
|
||||||
|
checkOneTerm(a, "ざ", "ざ"); // hiragana
|
||||||
|
checkOneTerm(a, "ザ", "ザ"); // katakana
|
||||||
|
checkOneTerm(a, "壹゙", "壹゙"); // ideographic
|
||||||
|
checkOneTerm(a, "아゙", "아゙"); // hangul
|
||||||
|
}
|
||||||
|
|
||||||
|
/** @deprecated remove this and sophisticated backwards layer in 5.0 */
|
||||||
|
@Deprecated
|
||||||
|
public void testCombiningMarksBackwards() throws Exception {
|
||||||
|
Analyzer a = new StandardAnalyzer(Version.LUCENE_33);
|
||||||
|
checkOneTerm(a, "ざ", "さ"); // hiragana Bug
|
||||||
|
checkOneTerm(a, "ザ", "ザ"); // katakana Works
|
||||||
|
checkOneTerm(a, "壹゙", "壹"); // ideographic Bug
|
||||||
|
checkOneTerm(a, "아゙", "아゙"); // hangul Works
|
||||||
|
}
|
||||||
|
|
||||||
/** blast some random strings through the analyzer */
|
/** blast some random strings through the analyzer */
|
||||||
public void testRandomStrings() throws Exception {
|
public void testRandomStrings() throws Exception {
|
||||||
checkRandomData(random, new StandardAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
|
checkRandomData(random, new StandardAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
|
||||||
|
|
Loading…
Reference in New Issue