mirror of https://github.com/apache/lucene.git
LUCENE-8527: Upgrade JFlex to 1.7.0. StandardTokenizer and UAX29URLEmailTokenizer now support Unicode 9.0, and provide UTS#51 v11.0 Emoji tokenization with the '<EMOJI>' token type.
This commit is contained in:
parent
7db4121b45
commit
283b19a8da
|
@ -241,6 +241,11 @@ Optimizations
|
||||||
|
|
||||||
======================= Lucene 7.7.0 =======================
|
======================= Lucene 7.7.0 =======================
|
||||||
|
|
||||||
|
Changes in Runtime Behavior
|
||||||
|
|
||||||
|
* LUCENE-8527: StandardTokenizer and UAX29URLEmailTokenizer now support Unicode 9.0,
|
||||||
|
and provide Unicode UTS#51 v11.0 Emoji tokenization with the "<EMOJI>" token type.
|
||||||
|
|
||||||
Build
|
Build
|
||||||
|
|
||||||
* LUCENE-8611: Update randomizedtesting to 2.7.2, JUnit to 4.12, add hamcrest-core
|
* LUCENE-8611: Update randomizedtesting to 2.7.2, JUnit to 4.12, add hamcrest-core
|
||||||
|
@ -293,6 +298,9 @@ Improvements
|
||||||
|
|
||||||
* LUCENE-8581: Change LatLonShape encoding to use 4 bytes Per Dimension.
|
* LUCENE-8581: Change LatLonShape encoding to use 4 bytes Per Dimension.
|
||||||
(Ignacio Vera, Nick Knize, Adrien Grand)
|
(Ignacio Vera, Nick Knize, Adrien Grand)
|
||||||
|
|
||||||
|
* LUCENE-8527: Upgrade JFlex dependency to 1.7.0; in StandardTokenizer and UAX29URLEmailTokenizer,
|
||||||
|
increase supported Unicode version from 6.3 to 9.0, and support Unicode UTS#51 v11.0 Emoji tokenization.
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
|
|
|
@ -33,18 +33,14 @@
|
||||||
|
|
||||||
<property name="unicode-props-file" location="src/java/org/apache/lucene/analysis/util/UnicodeProps.java"/>
|
<property name="unicode-props-file" location="src/java/org/apache/lucene/analysis/util/UnicodeProps.java"/>
|
||||||
|
|
||||||
<target name="jflex" depends="-install-jflex,clean-jflex,-jflex-ClassicAnalyzer,-jflex-UAX29URLEmailTokenizer,
|
<!-- Because of a bug in JFlex's ant task, HTMLStripCharFilter has to be generated last. -->
|
||||||
-jflex-wiki-tokenizer,-jflex-HTMLStripCharFilter"/>
|
<!-- Otherwise the "%apiprivate" option used in its specification will leak into following -->
|
||||||
|
<!-- ant task invocations. -->
|
||||||
|
<target name="jflex" depends="init,clean-jflex,-jflex-wiki-tokenizer,-jflex-ClassicAnalyzer,
|
||||||
|
-jflex-UAX29URLEmailTokenizer,-jflex-HTMLStripCharFilter"/>
|
||||||
|
|
||||||
<target name="-jflex-HTMLStripCharFilter"
|
<target name="-jflex-HTMLStripCharFilter" depends="-install-jflex,generate-jflex-html-char-entities">
|
||||||
depends="init,generate-jflex-html-char-entities">
|
<run-jflex dir="src/java/org/apache/lucene/analysis/charfilter" name="HTMLStripCharFilter"/>
|
||||||
<jflex file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex"
|
|
||||||
outdir="src/java/org/apache/lucene/analysis/charfilter"
|
|
||||||
nobak="on" inputstreamctor="false"/>
|
|
||||||
<!-- Remove the inappropriate JFlex-generated constructor -->
|
|
||||||
<replaceregexp file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java"
|
|
||||||
match="/\*\*\s*\*\s*Creates a new scanner\s*\*\s*\*\s*@param\s*in\s*the java.io.Reader to read input from\.\s*\*/\s*public HTMLStripCharFilter\(java\.io\.Reader in\)\s*\{\s*this.zzReader = in;\s*\}"
|
|
||||||
replace="" flags="s"/>
|
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="generate-jflex-html-char-entities">
|
<target name="generate-jflex-html-char-entities">
|
||||||
|
@ -58,17 +54,17 @@
|
||||||
<fixcrlf file="src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex" encoding="UTF-8"/>
|
<fixcrlf file="src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex" encoding="UTF-8"/>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="-jflex-wiki-tokenizer" depends="init,-install-jflex">
|
<target name="-jflex-wiki-tokenizer" depends="-install-jflex">
|
||||||
<run-jflex dir="src/java/org/apache/lucene/analysis/wikipedia" name="WikipediaTokenizerImpl"/>
|
<run-jflex dir="src/java/org/apache/lucene/analysis/wikipedia" name="WikipediaTokenizerImpl"/>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="-jflex-UAX29URLEmailTokenizer" depends="init,-install-jflex">
|
<target name="-jflex-ClassicAnalyzer" depends="-install-jflex">
|
||||||
<run-jflex-and-disable-buffer-expansion
|
<run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/>
|
||||||
dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
|
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="-jflex-ClassicAnalyzer" depends="init,-install-jflex">
|
<target name="-jflex-UAX29URLEmailTokenizer" depends="-install-jflex">
|
||||||
<run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/>
|
<run-jflex-and-disable-buffer-expansion
|
||||||
|
dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="clean-jflex">
|
<target name="clean-jflex">
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -33,7 +33,7 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
|
||||||
@SuppressWarnings("fallthrough")
|
@SuppressWarnings("fallthrough")
|
||||||
%%
|
%%
|
||||||
|
|
||||||
%unicode 6.3
|
%unicode 9.0
|
||||||
%apiprivate
|
%apiprivate
|
||||||
%type int
|
%type int
|
||||||
%final
|
%final
|
||||||
|
@ -50,6 +50,10 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
|
||||||
%xstate START_TAG_TAIL_INCLUDE, START_TAG_TAIL_EXCLUDE, START_TAG_TAIL_SUBSTITUTE
|
%xstate START_TAG_TAIL_INCLUDE, START_TAG_TAIL_EXCLUDE, START_TAG_TAIL_SUBSTITUTE
|
||||||
%xstate STYLE, STYLE_COMMENT
|
%xstate STYLE, STYLE_COMMENT
|
||||||
|
|
||||||
|
%init{
|
||||||
|
super(in);
|
||||||
|
%init}
|
||||||
|
|
||||||
// From XML 1.0 <http://www.w3.org/TR/xml/>:
|
// From XML 1.0 <http://www.w3.org/TR/xml/>:
|
||||||
//
|
//
|
||||||
// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [...]
|
// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [...]
|
||||||
|
@ -165,25 +169,15 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
private TextSegment outputSegment = inputSegment;
|
private TextSegment outputSegment = inputSegment;
|
||||||
private TextSegment entitySegment = new TextSegment(2);
|
private TextSegment entitySegment = new TextSegment(2);
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates a new HTMLStripCharFilter over the provided Reader.
|
|
||||||
* @param source Reader to strip html tags from.
|
|
||||||
*/
|
|
||||||
public HTMLStripCharFilter(Reader source) {
|
|
||||||
super(source);
|
|
||||||
this.zzReader = source;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a new HTMLStripCharFilter over the provided Reader
|
* Creates a new HTMLStripCharFilter over the provided Reader
|
||||||
* with the specified start and end tags.
|
* with the specified start and end tags.
|
||||||
* @param source Reader to strip html tags from.
|
* @param in Reader to strip html tags from.
|
||||||
* @param escapedTags Tags in this set (both start and end tags)
|
* @param escapedTags Tags in this set (both start and end tags)
|
||||||
* will not be filtered out.
|
* will not be filtered out.
|
||||||
*/
|
*/
|
||||||
public HTMLStripCharFilter(Reader source, Set<String> escapedTags) {
|
public HTMLStripCharFilter(Reader in, Set<String> escapedTags) {
|
||||||
super(source);
|
this(in);
|
||||||
this.zzReader = source;
|
|
||||||
if (null != escapedTags) {
|
if (null != escapedTags) {
|
||||||
for (String tag : escapedTags) {
|
for (String tag : escapedTags) {
|
||||||
if (tag.equalsIgnoreCase("BR")) {
|
if (tag.equalsIgnoreCase("BR")) {
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
/* The following code was generated by JFlex 1.6.0 */
|
/* The following code was generated by JFlex 1.7.0 */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
@ -251,7 +251,7 @@ class ClassicTokenizerImpl {
|
||||||
|
|
||||||
/* error messages for the codes above */
|
/* error messages for the codes above */
|
||||||
private static final String ZZ_ERROR_MSG[] = {
|
private static final String ZZ_ERROR_MSG[] = {
|
||||||
"Unkown internal scanner error",
|
"Unknown internal scanner error",
|
||||||
"Error: could not match input",
|
"Error: could not match input",
|
||||||
"Error: pushback value was too large"
|
"Error: pushback value was too large"
|
||||||
};
|
};
|
||||||
|
@ -323,11 +323,11 @@ class ClassicTokenizerImpl {
|
||||||
private int yycolumn;
|
private int yycolumn;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* zzAtBOL == true <=> the scanner is currently at the beginning of a line
|
* zzAtBOL == true iff the scanner is currently at the beginning of a line
|
||||||
*/
|
*/
|
||||||
private boolean zzAtBOL = true;
|
private boolean zzAtBOL = true;
|
||||||
|
|
||||||
/** zzAtEOF == true <=> the scanner is at the EOF */
|
/** zzAtEOF == true iff the scanner is at the EOF */
|
||||||
private boolean zzAtEOF;
|
private boolean zzAtEOF;
|
||||||
|
|
||||||
/** denotes if the user-EOF-code has already been executed */
|
/** denotes if the user-EOF-code has already been executed */
|
||||||
|
@ -436,28 +436,29 @@ public final void getText(CharTermAttribute t) {
|
||||||
}
|
}
|
||||||
|
|
||||||
/* fill the buffer with new input */
|
/* fill the buffer with new input */
|
||||||
int requested = zzBuffer.length - zzEndRead;
|
int requested = zzBuffer.length - zzEndRead;
|
||||||
int totalRead = 0;
|
int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
|
||||||
while (totalRead < requested) {
|
|
||||||
int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
|
|
||||||
if (numRead == -1) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
totalRead += numRead;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (totalRead > 0) {
|
/* not supposed to occur according to specification of java.io.Reader */
|
||||||
zzEndRead += totalRead;
|
if (numRead == 0) {
|
||||||
if (totalRead == requested) { /* possibly more input available */
|
throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
|
||||||
|
}
|
||||||
|
if (numRead > 0) {
|
||||||
|
zzEndRead += numRead;
|
||||||
|
/* If numRead == requested, we might have requested too few chars to
|
||||||
|
encode a full Unicode character. We assume that a Reader would
|
||||||
|
otherwise never return half characters. */
|
||||||
|
if (numRead == requested) {
|
||||||
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
|
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
|
||||||
--zzEndRead;
|
--zzEndRead;
|
||||||
zzFinalHighSurrogate = 1;
|
zzFinalHighSurrogate = 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/* potentially more input available */
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// totalRead = 0: End of stream
|
/* numRead < 0 ==> end of stream */
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -681,55 +682,65 @@ public final void getText(CharTermAttribute t) {
|
||||||
// store back cached position
|
// store back cached position
|
||||||
zzMarkedPos = zzMarkedPosL;
|
zzMarkedPos = zzMarkedPosL;
|
||||||
|
|
||||||
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
|
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
|
||||||
case 1:
|
zzAtEOF = true;
|
||||||
{ /* Break so we don't hit fall-through warning: */ break;/* ignore */
|
return YYEOF;
|
||||||
}
|
}
|
||||||
case 11: break;
|
else {
|
||||||
case 2:
|
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
|
||||||
{ return ALPHANUM;
|
case 1:
|
||||||
}
|
{ /* Break so we don't hit fall-through warning: */ break;/* ignore */
|
||||||
case 12: break;
|
}
|
||||||
case 3:
|
// fall through
|
||||||
{ return CJ;
|
case 11: break;
|
||||||
}
|
case 2:
|
||||||
case 13: break;
|
{ return ALPHANUM;
|
||||||
case 4:
|
}
|
||||||
{ return HOST;
|
// fall through
|
||||||
}
|
case 12: break;
|
||||||
case 14: break;
|
case 3:
|
||||||
case 5:
|
{ return CJ;
|
||||||
{ return NUM;
|
}
|
||||||
}
|
// fall through
|
||||||
case 15: break;
|
case 13: break;
|
||||||
case 6:
|
case 4:
|
||||||
{ return APOSTROPHE;
|
{ return HOST;
|
||||||
}
|
}
|
||||||
case 16: break;
|
// fall through
|
||||||
case 7:
|
case 14: break;
|
||||||
{ return COMPANY;
|
case 5:
|
||||||
}
|
{ return NUM;
|
||||||
case 17: break;
|
}
|
||||||
case 8:
|
// fall through
|
||||||
{ return ACRONYM_DEP;
|
case 15: break;
|
||||||
}
|
case 6:
|
||||||
case 18: break;
|
{ return APOSTROPHE;
|
||||||
case 9:
|
}
|
||||||
{ return ACRONYM;
|
// fall through
|
||||||
}
|
case 16: break;
|
||||||
case 19: break;
|
case 7:
|
||||||
case 10:
|
{ return COMPANY;
|
||||||
{ return EMAIL;
|
}
|
||||||
}
|
// fall through
|
||||||
case 20: break;
|
case 17: break;
|
||||||
default:
|
case 8:
|
||||||
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
|
{ return ACRONYM_DEP;
|
||||||
zzAtEOF = true;
|
}
|
||||||
return YYEOF;
|
// fall through
|
||||||
}
|
case 18: break;
|
||||||
else {
|
case 9:
|
||||||
|
{ return ACRONYM;
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
case 19: break;
|
||||||
|
case 10:
|
||||||
|
{ return EMAIL;
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
case 20: break;
|
||||||
|
default:
|
||||||
zzScanError(ZZ_NO_MATCH);
|
zzScanError(ZZ_NO_MATCH);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -32,33 +32,32 @@ import org.apache.lucene.util.AttributeFactory;
|
||||||
* algorithm, as specified in
|
* algorithm, as specified in
|
||||||
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
|
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
|
||||||
* URLs and email addresses are also tokenized according to the relevant RFCs.
|
* URLs and email addresses are also tokenized according to the relevant RFCs.
|
||||||
* <p>
|
|
||||||
* Tokens produced are of the following types:
|
|
||||||
* <ul>
|
|
||||||
* <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
|
|
||||||
* <li><NUM>: A number</li>
|
|
||||||
* <li><URL>: A URL</li>
|
|
||||||
* <li><EMAIL>: An email address</li>
|
|
||||||
* <li><SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast
|
|
||||||
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
|
|
||||||
* <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>
|
|
||||||
* <li><HIRAGANA>: A single hiragana character</li>
|
|
||||||
* </ul>
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public final class UAX29URLEmailTokenizer extends Tokenizer {
|
public final class UAX29URLEmailTokenizer extends Tokenizer {
|
||||||
/** A private instance of the JFlex-constructed scanner */
|
/** A private instance of the JFlex-constructed scanner */
|
||||||
private final UAX29URLEmailTokenizerImpl scanner;
|
private final UAX29URLEmailTokenizerImpl scanner;
|
||||||
|
|
||||||
public static final int ALPHANUM = 0;
|
/** Alpha/numeric token type */
|
||||||
public static final int NUM = 1;
|
public static final int ALPHANUM = 0;
|
||||||
public static final int SOUTHEAST_ASIAN = 2;
|
/** Numeric token type */
|
||||||
public static final int IDEOGRAPHIC = 3;
|
public static final int NUM = 1;
|
||||||
public static final int HIRAGANA = 4;
|
/** Southeast Asian token type */
|
||||||
public static final int KATAKANA = 5;
|
public static final int SOUTHEAST_ASIAN = 2;
|
||||||
public static final int HANGUL = 6;
|
/** Ideographic token type */
|
||||||
public static final int URL = 7;
|
public static final int IDEOGRAPHIC = 3;
|
||||||
public static final int EMAIL = 8;
|
/** Hiragana token type */
|
||||||
|
public static final int HIRAGANA = 4;
|
||||||
|
/** Katakana token type */
|
||||||
|
public static final int KATAKANA = 5;
|
||||||
|
/** Hangul token type */
|
||||||
|
public static final int HANGUL = 6;
|
||||||
|
/** URL token type */
|
||||||
|
public static final int URL = 7;
|
||||||
|
/** Email token type */
|
||||||
|
public static final int EMAIL = 8;
|
||||||
|
/** Emoji token type. */
|
||||||
|
public static final int EMOJI = 9;
|
||||||
|
|
||||||
/** String token types that correspond to token type int constants */
|
/** String token types that correspond to token type int constants */
|
||||||
public static final String [] TOKEN_TYPES = new String [] {
|
public static final String [] TOKEN_TYPES = new String [] {
|
||||||
|
@ -71,6 +70,7 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
|
||||||
StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL],
|
StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL],
|
||||||
"<URL>",
|
"<URL>",
|
||||||
"<EMAIL>",
|
"<EMAIL>",
|
||||||
|
StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI]
|
||||||
};
|
};
|
||||||
|
|
||||||
/** Absolute maximum sized token */
|
/** Absolute maximum sized token */
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -37,12 +37,13 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
* <li><HIRAGANA>: A single hiragana character</li>
|
* <li><HIRAGANA>: A single hiragana character</li>
|
||||||
* <li><KATAKANA>: A sequence of katakana characters</li>
|
* <li><KATAKANA>: A sequence of katakana characters</li>
|
||||||
* <li><HANGUL>: A sequence of Hangul characters</li>
|
* <li><HANGUL>: A sequence of Hangul characters</li>
|
||||||
|
* <li><EMOJI>: A sequence of Emoji characters</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings("fallthrough")
|
@SuppressWarnings("fallthrough")
|
||||||
%%
|
%%
|
||||||
|
|
||||||
%unicode 6.3
|
%unicode 9.0
|
||||||
%integer
|
%integer
|
||||||
%final
|
%final
|
||||||
%public
|
%public
|
||||||
|
@ -52,22 +53,73 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
%xstate AVOID_BAD_URL
|
%xstate AVOID_BAD_URL
|
||||||
%buffer 255
|
%buffer 255
|
||||||
|
|
||||||
// UAX#29 WB4. X (Extend | Format)* --> X
|
|
||||||
|
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
|
||||||
//
|
//
|
||||||
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] [\p{WB:Format}\p{WB:Extend}]*
|
ExtFmtZwj = [\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]*
|
||||||
HebrewOrALetterEx = [\p{WB:HebrewLetter}\p{WB:ALetter}] [\p{WB:Format}\p{WB:Extend}]*
|
|
||||||
NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] [\p{WB:Format}\p{WB:Extend}]*
|
|
||||||
KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
|
//////////////////////////////////////////////////////////////////////////
|
||||||
MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
|
// Begin Emoji Macros - see documentation below, near the EMOJI_TYPE rule
|
||||||
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
|
|
||||||
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
|
// TODO: Remove this include file when JFlex supports these properties directly (in Unicode 11.0+)
|
||||||
HanEx = \p{Script:Han} [\p{WB:Format}\p{WB:Extend}]*
|
%include ../../../../../../../../../core/src/data/jflex/UnicodeEmojiProperties.jflex
|
||||||
HiraganaEx = \p{Script:Hiragana} [\p{WB:Format}\p{WB:Extend}]*
|
|
||||||
SingleQuoteEx = \p{WB:Single_Quote} [\p{WB:Format}\p{WB:Extend}]*
|
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
|
||||||
DoubleQuoteEx = \p{WB:Double_Quote} [\p{WB:Format}\p{WB:Extend}]*
|
//
|
||||||
HebrewLetterEx = \p{WB:Hebrew_Letter} [\p{WB:Format}\p{WB:Extend}]*
|
// \uFE0E (Text Presentation Selector) and \uFE0F (Emoji Presentation Selector) - included in \p{WB:Extend}
|
||||||
RegionalIndicatorEx = \p{WB:RegionalIndicator} [\p{WB:Format}\p{WB:Extend}]*
|
// - are explicitly excluded here so that we can properly handle Emoji sequences.
|
||||||
ComplexContextEx = \p{LB:Complex_Context} [\p{WB:Format}\p{WB:Extend}]*
|
//
|
||||||
|
ExtFmtZwjSansPresSel = [[\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]--[\uFE0E\uFE0F]]*
|
||||||
|
|
||||||
|
KeyCapBaseChar = [0-9#*]
|
||||||
|
KeyCapBaseCharEx = {KeyCapBaseChar} {ExtFmtZwjSansPresSel}
|
||||||
|
KeyCap = \u20E3
|
||||||
|
KeyCapEx = {KeyCap} {ExtFmtZwjSansPresSel}
|
||||||
|
|
||||||
|
// # \u3030 = WAVY DASH; \u303D = PART ALTERNATION MARK
|
||||||
|
AccidentalEmoji = [©®™\u3030\u303D]
|
||||||
|
EmojiRKAM = ( \p{WB:Regional_Indicator} | {KeyCapBaseChar} | {AccidentalEmoji} | {Emoji_Modifier} )
|
||||||
|
|
||||||
|
// Unlike Unicode properties, macros are not allowed in character classes, so we achieve set difference
|
||||||
|
// by applying DeMorgan: the expression that matches everything of 'a' not matched by 'b' is: !(!a|b)
|
||||||
|
// TODO: Convert this expression to character class difference when JFlex supports the properties directly (in Unicode 11.0+)
|
||||||
|
EmojiSansRKAM = !( ! {Emoji} | {EmojiRKAM} )
|
||||||
|
|
||||||
|
EmojiChar = ( {Extended_Pictographic} | {EmojiSansRKAM} )
|
||||||
|
|
||||||
|
EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel}
|
||||||
|
EmojiModifierBaseEx = {Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
|
||||||
|
EmojiModifierEx = {Emoji_Modifier} {ExtFmtZwjSansPresSel}
|
||||||
|
|
||||||
|
EmojiPresentationSelector = \uFE0F
|
||||||
|
EmojiCharOrPresSeqOrModSeq = ( \p{WB:ZWJ}* {EmojiCharEx} {EmojiPresentationSelector}? ) | ( ( \p{WB:ZWJ}* {EmojiModifierBaseEx} )? {EmojiModifierEx} )
|
||||||
|
TagSpec = [\u{E0020}-\u{E007E}]
|
||||||
|
TagTerm = \u{E007F}
|
||||||
|
|
||||||
|
// End Emoji Macros
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
|
||||||
|
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
|
||||||
|
//
|
||||||
|
ExtFmtZwj = [\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]*
|
||||||
|
|
||||||
|
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] {ExtFmtZwj}
|
||||||
|
AHLetterEx = [\p{WB:ALetter}\p{WB:Hebrew_Letter}] {ExtFmtZwj}
|
||||||
|
NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] {ExtFmtZwj}
|
||||||
|
KatakanaEx = \p{WB:Katakana} {ExtFmtZwj}
|
||||||
|
MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] {ExtFmtZwj}
|
||||||
|
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] {ExtFmtZwj}
|
||||||
|
ExtendNumLetEx = \p{WB:ExtendNumLet} {ExtFmtZwj}
|
||||||
|
HanEx = \p{Script:Han} {ExtFmtZwj}
|
||||||
|
HiraganaEx = \p{Script:Hiragana} {ExtFmtZwj}
|
||||||
|
SingleQuoteEx = \p{WB:Single_Quote} {ExtFmtZwj}
|
||||||
|
DoubleQuoteEx = \p{WB:Double_Quote} {ExtFmtZwj}
|
||||||
|
HebrewLetterEx = \p{WB:Hebrew_Letter} {ExtFmtZwj}
|
||||||
|
RegionalIndicatorEx = \p{WB:Regional_Indicator} {ExtFmtZwj}
|
||||||
|
ComplexContextEx = \p{LB:Complex_Context} {ExtFmtZwj}
|
||||||
|
|
||||||
|
|
||||||
// URL and E-mail syntax specifications:
|
// URL and E-mail syntax specifications:
|
||||||
//
|
//
|
||||||
|
@ -174,18 +226,28 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
|
||||||
*/
|
*/
|
||||||
public static final int SOUTH_EAST_ASIAN_TYPE = UAX29URLEmailTokenizer.SOUTHEAST_ASIAN;
|
public static final int SOUTH_EAST_ASIAN_TYPE = UAX29URLEmailTokenizer.SOUTHEAST_ASIAN;
|
||||||
|
|
||||||
|
/** Ideographic token type */
|
||||||
public static final int IDEOGRAPHIC_TYPE = UAX29URLEmailTokenizer.IDEOGRAPHIC;
|
public static final int IDEOGRAPHIC_TYPE = UAX29URLEmailTokenizer.IDEOGRAPHIC;
|
||||||
|
|
||||||
|
/** Hiragana token type */
|
||||||
public static final int HIRAGANA_TYPE = UAX29URLEmailTokenizer.HIRAGANA;
|
public static final int HIRAGANA_TYPE = UAX29URLEmailTokenizer.HIRAGANA;
|
||||||
|
|
||||||
|
/** Katakana token type */
|
||||||
public static final int KATAKANA_TYPE = UAX29URLEmailTokenizer.KATAKANA;
|
public static final int KATAKANA_TYPE = UAX29URLEmailTokenizer.KATAKANA;
|
||||||
|
|
||||||
|
/** Hangul token type */
|
||||||
public static final int HANGUL_TYPE = UAX29URLEmailTokenizer.HANGUL;
|
public static final int HANGUL_TYPE = UAX29URLEmailTokenizer.HANGUL;
|
||||||
|
|
||||||
|
/** Email token type */
|
||||||
public static final int EMAIL_TYPE = UAX29URLEmailTokenizer.EMAIL;
|
public static final int EMAIL_TYPE = UAX29URLEmailTokenizer.EMAIL;
|
||||||
|
|
||||||
|
/** URL token type */
|
||||||
public static final int URL_TYPE = UAX29URLEmailTokenizer.URL;
|
public static final int URL_TYPE = UAX29URLEmailTokenizer.URL;
|
||||||
|
|
||||||
|
/** Emoji token type */
|
||||||
|
public static final int EMOJI_TYPE = UAX29URLEmailTokenizer.EMOJI;
|
||||||
|
|
||||||
|
/** Character count processed so far */
|
||||||
public final int yychar()
|
public final int yychar()
|
||||||
{
|
{
|
||||||
return yychar;
|
return yychar;
|
||||||
|
@ -213,11 +275,11 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
|
||||||
|
|
||||||
<YYINITIAL, AVOID_BAD_URL> {
|
<YYINITIAL, AVOID_BAD_URL> {
|
||||||
|
|
||||||
// UAX#29 WB1. sot ÷
|
// UAX#29 WB1. sot ÷ Any
|
||||||
// WB2. ÷ eot
|
// WB2. Any ÷ eot
|
||||||
//
|
//
|
||||||
<<EOF>> { return YYEOF; }
|
<<EOF>> { return YYEOF; }
|
||||||
|
|
||||||
{URL} { yybegin(YYINITIAL); return URL_TYPE; }
|
{URL} { yybegin(YYINITIAL); return URL_TYPE; }
|
||||||
|
|
||||||
// LUCENE-5391: Don't recognize no-scheme domain-only URLs with a following alphanumeric character
|
// LUCENE-5391: Don't recognize no-scheme domain-only URLs with a following alphanumeric character
|
||||||
|
@ -244,14 +306,61 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
|
||||||
|
|
||||||
{EMAIL} { yybegin(YYINITIAL); return EMAIL_TYPE; }
|
{EMAIL} { yybegin(YYINITIAL); return EMAIL_TYPE; }
|
||||||
|
|
||||||
// UAX#29 WB8. Numeric × Numeric
|
|
||||||
// WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
|
// Instead of these: UAX#29 WB3c. ZWJ × (Glue_After_Zwj | EBG)
|
||||||
// WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
|
// WB14. (E_Base | EBG) × E_Modifier
|
||||||
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
// WB15. ^ (RI RI)* RI × RI
|
||||||
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
|
// WB16. [^RI] (RI RI)* RI × RI
|
||||||
|
//
|
||||||
|
// We use the "emoji_sequence" rule from http://www.unicode.org/reports/tr51/tr51-14.html (Unicode 11.0)
|
||||||
|
// and the Emoji data from http://unicode.org/Public/emoji/11.0/emoji-data.txt (in included file UnicodeEmojiProperties.jflex)
|
||||||
|
//
|
||||||
|
// emoji_sequence :=
|
||||||
|
// Top-level EBNF Expanded #1 Expanded #2 Expanded #3
|
||||||
|
// --------------------- ---------------------------- ----------------------------- ----------------------------------------------
|
||||||
|
// emoji_core_sequence emoji_combining_sequence emoji_character ( \p{Emoji}
|
||||||
|
// | emoji_presentation_sequence | \p{Emoji} \uFE0F
|
||||||
|
// | emoji_keycap_sequence | [0-9#*] \u{FE0F 20E3} [1]
|
||||||
|
// | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier}
|
||||||
|
// | emoji_flag_sequence | \p{WB:Regional_Indicator}{2} )
|
||||||
|
//
|
||||||
|
// | emoji_zwj_sequence emoji_zwj_element emoji_character ( \p{Emoji}
|
||||||
|
// | emoji_presentation_sequence | \p{Emoji} \uFE0F
|
||||||
|
// | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier} )
|
||||||
|
// ( ZWJ emoji_zwj_element )+ ( \p{WB:ZWJ} ^^ )+
|
||||||
|
//
|
||||||
|
// | emoji_tag_sequence tag_base emoji_character ( \p{Emoji}
|
||||||
|
// | emoji_presentation_sequence | \p{Emoji} \uFE0F
|
||||||
|
// | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier} )
|
||||||
|
// tag_spec [\u{E0020}-\u{E007E}]+
|
||||||
|
// tag_term \u{E007F}
|
||||||
|
//
|
||||||
|
// [1] https://unicode.org/Public/emoji/11.0/emoji-test.txt includes key cap sequences
|
||||||
|
// WITHOUT \uFE0F (emoji presentation selector), annotating them as "non-fully-qualified";
|
||||||
|
// TR#51 says about non-fully-qualified *ZWJ sequences* that implementations may
|
||||||
|
// choose whether to support them for segmentation. This implementation will
|
||||||
|
// recognize /[0-9#*]\u20E3/ - i.e. without \uFE0F - as Emoji.
|
||||||
|
//
|
||||||
|
// See also: http://www.unicode.org/L2/L2016/16315-handling-seg-emoji.pdf
|
||||||
|
// https://docs.google.com/document/d/1yDZ5TUZNVVKaM9zYCCLbRIAKGNZANsAGl0bcNzGGvn8
|
||||||
|
//
|
||||||
|
// In particular, the above docs recommend a modified UAX#29 WB3c rule (covered by TR#51's "emoji_zwj_sequence"):
|
||||||
|
//
|
||||||
|
// WB3c′ ZWJ × (Extended_Pictographic | EmojiNRK)
|
||||||
|
//
|
||||||
|
{EmojiCharOrPresSeqOrModSeq} ( ( \p{WB:ZWJ} {EmojiCharOrPresSeqOrModSeq} )* | {TagSpec}+ {TagTerm} )
|
||||||
|
| {KeyCapBaseCharEx} {EmojiPresentationSelector}? {KeyCapEx}
|
||||||
|
| {RegionalIndicatorEx}{2}
|
||||||
|
{ yybegin(YYINITIAL); return EMOJI_TYPE; }
|
||||||
|
|
||||||
|
// UAX#29 WB8. Numeric × Numeric
|
||||||
|
// WB11. Numeric (MidNum | MidNumLetQ) × Numeric
|
||||||
|
// WB12. Numeric × (MidNum | MidNumLetQ) Numeric
|
||||||
|
// WB13a. (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
||||||
|
// WB13b. ExtendNumLet × (AHLetter | Numeric | Katakana)
|
||||||
//
|
//
|
||||||
{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
|
{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
|
||||||
{ yybegin(YYINITIAL); return NUMERIC_TYPE; }
|
{ yybegin(YYINITIAL); return NUMERIC_TYPE; }
|
||||||
|
|
||||||
// subset of the below for typing purposes only!
|
// subset of the below for typing purposes only!
|
||||||
{HangulEx}+
|
{HangulEx}+
|
||||||
|
@ -260,32 +369,32 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
|
||||||
{KatakanaEx}+
|
{KatakanaEx}+
|
||||||
{ yybegin(YYINITIAL); return KATAKANA_TYPE; }
|
{ yybegin(YYINITIAL); return KATAKANA_TYPE; }
|
||||||
|
|
||||||
// UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
|
// UAX#29 WB5. AHLetter × AHLetter
|
||||||
// WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
|
// WB6. AHLetter × (MidLetter | MidNumLetQ) AHLetter
|
||||||
// WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
|
// WB7. AHLetter (MidLetter | MidNumLetQ) × AHLetter
|
||||||
// WB7a. Hebrew_Letter × Single_Quote
|
// WB7a. Hebrew_Letter × Single_Quote
|
||||||
// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
|
// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
|
||||||
// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
|
// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
|
||||||
// WB9. (ALetter | Hebrew_Letter) × Numeric
|
// WB9. AHLetter × Numeric
|
||||||
// WB10. Numeric × (ALetter | Hebrew_Letter)
|
// WB10. Numeric × AHLetter
|
||||||
// WB13. Katakana × Katakana
|
// WB13. Katakana × Katakana
|
||||||
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
||||||
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
|
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
|
||||||
//
|
//
|
||||||
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
||||||
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
|
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
|
||||||
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
|
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
|
||||||
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
|
| {AHLetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {AHLetterEx} )*
|
||||||
)+
|
)+
|
||||||
)
|
)
|
||||||
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
||||||
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
|
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
|
||||||
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
|
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
|
||||||
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
|
| {AHLetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {AHLetterEx} )*
|
||||||
)+
|
)+
|
||||||
)
|
)
|
||||||
)*
|
)*
|
||||||
{ExtendNumLetEx}*
|
{ExtendNumLetEx}*
|
||||||
{ yybegin(YYINITIAL); return WORD_TYPE; }
|
{ yybegin(YYINITIAL); return WORD_TYPE; }
|
||||||
|
|
||||||
|
|
||||||
|
@ -297,7 +406,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
|
||||||
// annex. That means that satisfactory treatment of languages like Chinese
|
// annex. That means that satisfactory treatment of languages like Chinese
|
||||||
// or Thai requires special handling.
|
// or Thai requires special handling.
|
||||||
//
|
//
|
||||||
// In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
|
// In Unicode 9.0, only one character has the \p{Line_Break = Contingent_Break}
|
||||||
// property: U+FFFC (  ) OBJECT REPLACEMENT CHARACTER.
|
// property: U+FFFC (  ) OBJECT REPLACEMENT CHARACTER.
|
||||||
//
|
//
|
||||||
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
|
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
|
||||||
|
@ -310,18 +419,15 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
|
||||||
//
|
//
|
||||||
{ComplexContextEx}+ { yybegin(YYINITIAL); return SOUTH_EAST_ASIAN_TYPE; }
|
{ComplexContextEx}+ { yybegin(YYINITIAL); return SOUTH_EAST_ASIAN_TYPE; }
|
||||||
|
|
||||||
// UAX#29 WB14. Any ÷ Any
|
// UAX#29 WB999. Any ÷ Any
|
||||||
//
|
//
|
||||||
{HanEx} { yybegin(YYINITIAL); return IDEOGRAPHIC_TYPE; }
|
{HanEx} { yybegin(YYINITIAL); return IDEOGRAPHIC_TYPE; }
|
||||||
{HiraganaEx} { yybegin(YYINITIAL); return HIRAGANA_TYPE; }
|
{HiraganaEx} { yybegin(YYINITIAL); return HIRAGANA_TYPE; }
|
||||||
|
|
||||||
|
// UAX#29 WB3. CR × LF
|
||||||
// UAX#29 WB3. CR × LF
|
// WB3a. (Newline | CR | LF) ÷
|
||||||
// WB3a. (Newline | CR | LF) ÷
|
// WB3b. ÷ (Newline | CR | LF)
|
||||||
// WB3b. ÷ (Newline | CR | LF)
|
// WB999. Any ÷ Any
|
||||||
// WB13c. Regional_Indicator × Regional_Indicator
|
|
||||||
// WB14. Any ÷ Any
|
|
||||||
//
|
//
|
||||||
{RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
|
[^] { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, emoji or SE Asian -- ignore it. */ }
|
||||||
{ yybegin(YYINITIAL); /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
|
|
||||||
}
|
}
|
|
@ -1,4 +1,4 @@
|
||||||
/* The following code was generated by JFlex 1.6.0 */
|
/* The following code was generated by JFlex 1.7.0 */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
@ -341,7 +341,7 @@ class WikipediaTokenizerImpl {
|
||||||
|
|
||||||
/* error messages for the codes above */
|
/* error messages for the codes above */
|
||||||
private static final String ZZ_ERROR_MSG[] = {
|
private static final String ZZ_ERROR_MSG[] = {
|
||||||
"Unkown internal scanner error",
|
"Unknown internal scanner error",
|
||||||
"Error: could not match input",
|
"Error: could not match input",
|
||||||
"Error: pushback value was too large"
|
"Error: pushback value was too large"
|
||||||
};
|
};
|
||||||
|
@ -419,11 +419,11 @@ class WikipediaTokenizerImpl {
|
||||||
private int yycolumn;
|
private int yycolumn;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* zzAtBOL == true <=> the scanner is currently at the beginning of a line
|
* zzAtBOL == true iff the scanner is currently at the beginning of a line
|
||||||
*/
|
*/
|
||||||
private boolean zzAtBOL = true;
|
private boolean zzAtBOL = true;
|
||||||
|
|
||||||
/** zzAtEOF == true <=> the scanner is at the EOF */
|
/** zzAtEOF == true iff the scanner is at the EOF */
|
||||||
private boolean zzAtEOF;
|
private boolean zzAtEOF;
|
||||||
|
|
||||||
/** denotes if the user-EOF-code has already been executed */
|
/** denotes if the user-EOF-code has already been executed */
|
||||||
|
@ -575,28 +575,29 @@ final void reset() {
|
||||||
}
|
}
|
||||||
|
|
||||||
/* fill the buffer with new input */
|
/* fill the buffer with new input */
|
||||||
int requested = zzBuffer.length - zzEndRead;
|
int requested = zzBuffer.length - zzEndRead;
|
||||||
int totalRead = 0;
|
int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
|
||||||
while (totalRead < requested) {
|
|
||||||
int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
|
|
||||||
if (numRead == -1) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
totalRead += numRead;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (totalRead > 0) {
|
/* not supposed to occur according to specification of java.io.Reader */
|
||||||
zzEndRead += totalRead;
|
if (numRead == 0) {
|
||||||
if (totalRead == requested) { /* possibly more input available */
|
throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
|
||||||
|
}
|
||||||
|
if (numRead > 0) {
|
||||||
|
zzEndRead += numRead;
|
||||||
|
/* If numRead == requested, we might have requested too few chars to
|
||||||
|
encode a full Unicode character. We assume that a Reader would
|
||||||
|
otherwise never return half characters. */
|
||||||
|
if (numRead == requested) {
|
||||||
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
|
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
|
||||||
--zzEndRead;
|
--zzEndRead;
|
||||||
zzFinalHighSurrogate = 1;
|
zzFinalHighSurrogate = 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/* potentially more input available */
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// totalRead = 0: End of stream
|
/* numRead < 0 ==> end of stream */
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -820,199 +821,245 @@ final void reset() {
|
||||||
// store back cached position
|
// store back cached position
|
||||||
zzMarkedPos = zzMarkedPosL;
|
zzMarkedPos = zzMarkedPosL;
|
||||||
|
|
||||||
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
|
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
|
||||||
case 1:
|
zzAtEOF = true;
|
||||||
{ numWikiTokensSeen = 0; positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
|
return YYEOF;
|
||||||
}
|
}
|
||||||
case 47: break;
|
else {
|
||||||
case 2:
|
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
|
||||||
{ positionInc = 1; return ALPHANUM;
|
case 1:
|
||||||
}
|
{ numWikiTokensSeen = 0; positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
|
||||||
case 48: break;
|
}
|
||||||
case 3:
|
// fall through
|
||||||
{ positionInc = 1; return CJ;
|
case 47: break;
|
||||||
}
|
case 2:
|
||||||
case 49: break;
|
{ positionInc = 1; return ALPHANUM;
|
||||||
case 4:
|
}
|
||||||
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
|
// fall through
|
||||||
}
|
case 48: break;
|
||||||
case 50: break;
|
case 3:
|
||||||
case 5:
|
{ positionInc = 1; return CJ;
|
||||||
{ positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
|
}
|
||||||
}
|
// fall through
|
||||||
case 51: break;
|
case 49: break;
|
||||||
case 6:
|
case 4:
|
||||||
{ yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;
|
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
|
||||||
}
|
}
|
||||||
case 52: break;
|
// fall through
|
||||||
case 7:
|
case 50: break;
|
||||||
{ yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;
|
case 5:
|
||||||
}
|
{ positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
|
||||||
case 53: break;
|
}
|
||||||
case 8:
|
// fall through
|
||||||
{ /* Break so we don't hit fall-through warning: */ break;/* ignore */
|
case 51: break;
|
||||||
}
|
case 6:
|
||||||
case 54: break;
|
{ yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;
|
||||||
case 9:
|
}
|
||||||
{ if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
|
// fall through
|
||||||
}
|
case 52: break;
|
||||||
case 55: break;
|
case 7:
|
||||||
case 10:
|
{ yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;
|
||||||
{ numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
|
}
|
||||||
}
|
// fall through
|
||||||
case 56: break;
|
case 53: break;
|
||||||
case 11:
|
case 8:
|
||||||
{ currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
|
{ /* Break so we don't hit fall-through warning: */ break;/* ignore */
|
||||||
}
|
}
|
||||||
case 57: break;
|
// fall through
|
||||||
case 12:
|
case 54: break;
|
||||||
{ currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/
|
case 9:
|
||||||
}
|
{ if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
|
||||||
case 58: break;
|
}
|
||||||
case 13:
|
// fall through
|
||||||
{ currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
|
case 55: break;
|
||||||
}
|
case 10:
|
||||||
case 59: break;
|
{ numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
|
||||||
case 14:
|
}
|
||||||
{ yybegin(STRING); numWikiTokensSeen++; return currentTokType;
|
// fall through
|
||||||
}
|
case 56: break;
|
||||||
case 60: break;
|
case 11:
|
||||||
case 15:
|
{ currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
|
||||||
{ currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING); /* Break so we don't hit fall-through warning: */ break;
|
}
|
||||||
}
|
// fall through
|
||||||
case 61: break;
|
case 57: break;
|
||||||
case 16:
|
case 12:
|
||||||
{ currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;
|
{ currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/
|
||||||
}
|
}
|
||||||
case 62: break;
|
// fall through
|
||||||
case 17:
|
case 58: break;
|
||||||
{ yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;
|
case 13:
|
||||||
}
|
{ currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
|
||||||
case 63: break;
|
}
|
||||||
case 18:
|
// fall through
|
||||||
{ /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */
|
case 59: break;
|
||||||
}
|
case 14:
|
||||||
case 64: break;
|
{ yybegin(STRING); numWikiTokensSeen++; return currentTokType;
|
||||||
case 19:
|
}
|
||||||
{ yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/
|
// fall through
|
||||||
}
|
case 60: break;
|
||||||
case 65: break;
|
case 15:
|
||||||
case 20:
|
{ currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING); /* Break so we don't hit fall-through warning: */ break;
|
||||||
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
|
}
|
||||||
}
|
// fall through
|
||||||
case 66: break;
|
case 61: break;
|
||||||
case 21:
|
case 16:
|
||||||
{ yybegin(STRING); return currentTokType;/*pipe*/
|
{ currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;
|
||||||
}
|
}
|
||||||
case 67: break;
|
// fall through
|
||||||
case 22:
|
case 62: break;
|
||||||
{ numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}/* Break so we don't hit fall-through warning: */ break;
|
case 17:
|
||||||
}
|
{ yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;
|
||||||
case 68: break;
|
}
|
||||||
case 23:
|
// fall through
|
||||||
{ numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);/* Break so we don't hit fall-through warning: */ break;
|
case 63: break;
|
||||||
}
|
case 18:
|
||||||
case 69: break;
|
{ /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */
|
||||||
case 24:
|
}
|
||||||
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
|
// fall through
|
||||||
}
|
case 64: break;
|
||||||
case 70: break;
|
case 19:
|
||||||
case 25:
|
{ yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/
|
||||||
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;
|
}
|
||||||
}
|
// fall through
|
||||||
case 71: break;
|
case 65: break;
|
||||||
case 26:
|
case 20:
|
||||||
{ yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;
|
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
|
||||||
}
|
}
|
||||||
case 72: break;
|
// fall through
|
||||||
case 27:
|
case 66: break;
|
||||||
{ numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
|
case 21:
|
||||||
}
|
{ yybegin(STRING); return currentTokType;/*pipe*/
|
||||||
case 73: break;
|
}
|
||||||
case 28:
|
// fall through
|
||||||
{ currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
|
case 67: break;
|
||||||
}
|
case 22:
|
||||||
case 74: break;
|
{ numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}/* Break so we don't hit fall-through warning: */ break;
|
||||||
case 29:
|
}
|
||||||
{ currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
|
// fall through
|
||||||
}
|
case 68: break;
|
||||||
case 75: break;
|
case 23:
|
||||||
case 30:
|
{ numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);/* Break so we don't hit fall-through warning: */ break;
|
||||||
{ yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
|
}
|
||||||
}
|
// fall through
|
||||||
case 76: break;
|
case 69: break;
|
||||||
case 31:
|
case 24:
|
||||||
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end italics*/
|
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
|
||||||
}
|
}
|
||||||
case 77: break;
|
// fall through
|
||||||
case 32:
|
case 70: break;
|
||||||
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
|
case 25:
|
||||||
}
|
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;
|
||||||
case 78: break;
|
}
|
||||||
case 33:
|
// fall through
|
||||||
{ positionInc = 1; return APOSTROPHE;
|
case 71: break;
|
||||||
}
|
case 26:
|
||||||
case 79: break;
|
{ yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;
|
||||||
case 34:
|
}
|
||||||
{ positionInc = 1; return HOST;
|
// fall through
|
||||||
}
|
case 72: break;
|
||||||
case 80: break;
|
case 27:
|
||||||
case 35:
|
{ numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
|
||||||
{ positionInc = 1; return NUM;
|
}
|
||||||
}
|
// fall through
|
||||||
case 81: break;
|
case 73: break;
|
||||||
case 36:
|
case 28:
|
||||||
{ positionInc = 1; return COMPANY;
|
{ currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
|
||||||
}
|
}
|
||||||
case 82: break;
|
// fall through
|
||||||
case 37:
|
case 74: break;
|
||||||
{ currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
|
case 29:
|
||||||
}
|
{ currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
|
||||||
case 83: break;
|
}
|
||||||
case 38:
|
// fall through
|
||||||
{ numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold*/
|
case 75: break;
|
||||||
}
|
case 30:
|
||||||
case 84: break;
|
{ yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
|
||||||
case 39:
|
}
|
||||||
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end sub header*/
|
// fall through
|
||||||
}
|
case 76: break;
|
||||||
case 85: break;
|
case 31:
|
||||||
case 40:
|
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end italics*/
|
||||||
{ positionInc = 1; return ACRONYM;
|
}
|
||||||
}
|
// fall through
|
||||||
case 86: break;
|
case 77: break;
|
||||||
case 41:
|
case 32:
|
||||||
{ positionInc = 1; return EMAIL;
|
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
|
||||||
}
|
}
|
||||||
case 87: break;
|
// fall through
|
||||||
case 42:
|
case 78: break;
|
||||||
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold italics*/
|
case 33:
|
||||||
}
|
{ positionInc = 1; return APOSTROPHE;
|
||||||
case 88: break;
|
}
|
||||||
case 43:
|
// fall through
|
||||||
{ positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
|
case 79: break;
|
||||||
}
|
case 34:
|
||||||
case 89: break;
|
{ positionInc = 1; return HOST;
|
||||||
case 44:
|
}
|
||||||
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);/* Break so we don't hit fall-through warning: */ break;
|
// fall through
|
||||||
}
|
case 80: break;
|
||||||
case 90: break;
|
case 35:
|
||||||
case 45:
|
{ positionInc = 1; return NUM;
|
||||||
{ currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
|
}
|
||||||
}
|
// fall through
|
||||||
case 91: break;
|
case 81: break;
|
||||||
case 46:
|
case 36:
|
||||||
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
|
{ positionInc = 1; return COMPANY;
|
||||||
}
|
}
|
||||||
case 92: break;
|
// fall through
|
||||||
default:
|
case 82: break;
|
||||||
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
|
case 37:
|
||||||
zzAtEOF = true;
|
{ currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
|
||||||
return YYEOF;
|
}
|
||||||
}
|
// fall through
|
||||||
else {
|
case 83: break;
|
||||||
|
case 38:
|
||||||
|
{ numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold*/
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
case 84: break;
|
||||||
|
case 39:
|
||||||
|
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end sub header*/
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
case 85: break;
|
||||||
|
case 40:
|
||||||
|
{ positionInc = 1; return ACRONYM;
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
case 86: break;
|
||||||
|
case 41:
|
||||||
|
{ positionInc = 1; return EMAIL;
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
case 87: break;
|
||||||
|
case 42:
|
||||||
|
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold italics*/
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
case 88: break;
|
||||||
|
case 43:
|
||||||
|
{ positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
case 89: break;
|
||||||
|
case 44:
|
||||||
|
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);/* Break so we don't hit fall-through warning: */ break;
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
case 90: break;
|
||||||
|
case 45:
|
||||||
|
{ currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
case 91: break;
|
||||||
|
case 46:
|
||||||
|
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
case 92: break;
|
||||||
|
default:
|
||||||
zzScanError(ZZ_NO_MATCH);
|
zzScanError(ZZ_NO_MATCH);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -499,7 +499,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
String randomHtmlishString2 // Don't create a comment (disallow "<!--") and don't include a closing ">"
|
String randomHtmlishString2 // Don't create a comment (disallow "<!--") and don't include a closing ">"
|
||||||
= TestUtil.randomHtmlishString(random(), maxNumElems).replaceAll(">", " ").replaceFirst("^--","__");
|
= TestUtil.randomHtmlishString(random(), maxNumElems).replaceAll(">", " ").replaceFirst("^--","__");
|
||||||
String unclosedAngleBangNonCDATA = "<!" + randomHtmlishString1 +"-[CDATA[";
|
String unclosedAngleBangNonCDATA = "<!" + randomHtmlishString2 +"-[CDATA[";
|
||||||
|
|
||||||
String[] testGold = {
|
String[] testGold = {
|
||||||
"one<![CDATA[<one><two>three<four></four></two></one>]]>two",
|
"one<![CDATA[<one><two>three<four></four></two></one>]]>two",
|
||||||
|
|
|
@ -361,14 +361,14 @@ public class TestUAX29URLEmailAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
StringBuilder bToken = new StringBuilder();
|
StringBuilder bToken = new StringBuilder();
|
||||||
// exact max length:
|
// exact max length:
|
||||||
for(int i=0;i<StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
|
for(int i=0;i<UAX29URLEmailAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
|
||||||
bToken.append('b');
|
bToken.append('b');
|
||||||
}
|
}
|
||||||
|
|
||||||
String bString = bToken.toString();
|
String bString = bToken.toString();
|
||||||
// first bString is exact max default length; next one is 1 too long
|
// first bString is exact max default length; next one is 1 too long
|
||||||
String input = "x " + bString + " " + bString + "b";
|
String input = "x " + bString + " " + bString + "b";
|
||||||
assertAnalyzesTo(a, input.toString(), new String[] {"x", bString, bString, "b"});
|
assertAnalyzesTo(a, input, new String[] {"x", bString, bString, "b"});
|
||||||
a.close();
|
a.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -467,7 +467,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testUnicodeWordBreaks() throws Exception {
|
public void testUnicodeWordBreaks() throws Exception {
|
||||||
WordBreakTestUnicode_6_3_0 wordBreakTest = new WordBreakTestUnicode_6_3_0();
|
WordBreakTestUnicode_9_0_0 wordBreakTest = new WordBreakTestUnicode_9_0_0();
|
||||||
wordBreakTest.test(a);
|
wordBreakTest.test(a);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -545,6 +545,80 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/** simple emoji */
|
||||||
|
public void testEmoji() throws Exception {
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩 💩💩",
|
||||||
|
new String[] { "💩", "💩", "💩" },
|
||||||
|
new String[] { "<EMOJI>", "<EMOJI>", "<EMOJI>" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/** emoji zwj sequence */
|
||||||
|
public void testEmojiSequence() throws Exception {
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "👩❤️👩",
|
||||||
|
new String[] { "👩❤️👩" },
|
||||||
|
new String[] { "<EMOJI>" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/** emoji zwj sequence with fitzpatrick modifier */
|
||||||
|
public void testEmojiSequenceWithModifier() throws Exception {
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "👨🏼⚕️",
|
||||||
|
new String[] { "👨🏼⚕️" },
|
||||||
|
new String[] { "<EMOJI>" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/** regional indicator */
|
||||||
|
public void testEmojiRegionalIndicator() throws Exception {
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "🇺🇸🇺🇸",
|
||||||
|
new String[] { "🇺🇸", "🇺🇸" },
|
||||||
|
new String[] { "<EMOJI>", "<EMOJI>" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/** variation sequence */
|
||||||
|
public void testEmojiVariationSequence() throws Exception {
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣",
|
||||||
|
new String[] { "#️⃣" },
|
||||||
|
new String[] { "<EMOJI>" });
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "3️⃣",
|
||||||
|
new String[] { "3️⃣",},
|
||||||
|
new String[] { "<EMOJI>" });
|
||||||
|
|
||||||
|
// text presentation sequences
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "#\uFE0E",
|
||||||
|
new String[] { },
|
||||||
|
new String[] { });
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "3\uFE0E", // \uFE0E is included in \p{WB:Extend}
|
||||||
|
new String[] { "3\uFE0E",},
|
||||||
|
new String[] { "<NUM>" });
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E", // \u2B55 = HEAVY BLACK CIRCLE
|
||||||
|
new String[] { "\u2B55",},
|
||||||
|
new String[] { "<EMOJI>" });
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E\u200D\u2B55\uFE0E",
|
||||||
|
new String[] { "\u2B55", "\u200D\u2B55"},
|
||||||
|
new String[] { "<EMOJI>", "<EMOJI>" });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testEmojiTagSequence() throws Exception {
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "🏴",
|
||||||
|
new String[] { "🏴" },
|
||||||
|
new String[] { "<EMOJI>" });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testEmojiTokenization() throws Exception {
|
||||||
|
// simple emoji around latin
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "poo💩poo",
|
||||||
|
new String[] { "poo", "💩", "poo" },
|
||||||
|
new String[] { "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>" });
|
||||||
|
// simple emoji around non-latin
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩中國💩",
|
||||||
|
new String[] { "💩", "中", "國", "💩" },
|
||||||
|
new String[] { "<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>" });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testUnicodeEmojiTests() throws Exception {
|
||||||
|
EmojiTokenizationTestUnicode_11_0 emojiTest = new EmojiTokenizationTestUnicode_11_0();
|
||||||
|
emojiTest.test(a);
|
||||||
|
}
|
||||||
|
|
||||||
/** blast some random strings through the analyzer */
|
/** blast some random strings through the analyzer */
|
||||||
public void testRandomStrings() throws Exception {
|
public void testRandomStrings() throws Exception {
|
||||||
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
|
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
|
||||||
|
|
|
@ -2388,7 +2388,7 @@ ${ant.project.name}.test.dependencies=${test.classpath.list}
|
||||||
|
|
||||||
<!-- JFlex task -->
|
<!-- JFlex task -->
|
||||||
<target name="-install-jflex" unless="jflex.loaded" depends="ivy-availability-check,ivy-configure">
|
<target name="-install-jflex" unless="jflex.loaded" depends="ivy-availability-check,ivy-configure">
|
||||||
<ivy:cachepath organisation="de.jflex" module="jflex" revision="1.6.0"
|
<ivy:cachepath organisation="de.jflex" module="jflex" revision="1.7.0"
|
||||||
inline="true" conf="default" transitive="true" pathid="jflex.classpath"/>
|
inline="true" conf="default" transitive="true" pathid="jflex.classpath"/>
|
||||||
<taskdef name="jflex" classname="jflex.anttask.JFlexTask" classpathref="jflex.classpath"/>
|
<taskdef name="jflex" classname="jflex.anttask.JFlexTask" classpathref="jflex.classpath"/>
|
||||||
<property name="jflex.loaded" value="true"/>
|
<property name="jflex.loaded" value="true"/>
|
||||||
|
@ -2645,7 +2645,11 @@ The following arguments can be provided to ant to alter its behaviour and target
|
||||||
<attribute name="dir"/>
|
<attribute name="dir"/>
|
||||||
<attribute name="name"/>
|
<attribute name="name"/>
|
||||||
<sequential>
|
<sequential>
|
||||||
<jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on" inputstreamctor="false"/>
|
<!-- The default skeleton is specified here to work around a JFlex ant task bug: -->
|
||||||
|
<!-- invocations with a non-default skeleton will cause following invocations to -->
|
||||||
|
<!-- use the same skeleton, though not specified, unless the default is configured. -->
|
||||||
|
<jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on"
|
||||||
|
skeleton="${common.dir}/core/src/data/jflex/skeleton.default"/>
|
||||||
</sequential>
|
</sequential>
|
||||||
</macrodef>
|
</macrodef>
|
||||||
|
|
||||||
|
@ -2653,20 +2657,13 @@ The following arguments can be provided to ant to alter its behaviour and target
|
||||||
<attribute name="dir"/>
|
<attribute name="dir"/>
|
||||||
<attribute name="name"/>
|
<attribute name="name"/>
|
||||||
<sequential>
|
<sequential>
|
||||||
<jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on" inputstreamctor="false"/>
|
|
||||||
<!-- LUCENE-5897: Disallow scanner buffer expansion -->
|
<!-- LUCENE-5897: Disallow scanner buffer expansion -->
|
||||||
<replaceregexp file="@{dir}/@{name}.java"
|
<jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on"
|
||||||
match="[ \t]*/\* is the buffer big enough\? \*/\s+if \(zzCurrentPos >= zzBuffer\.length.*?\}[ \t]*\r?\n"
|
skeleton="${common.dir}/core/src/data/jflex/skeleton.disable.buffer.expansion.txt"/>
|
||||||
replace="" flags="s" />
|
<!-- Since the ZZ_BUFFERSIZE declaration is generated rather than in the skeleton, we have to transform it here. -->
|
||||||
<replaceregexp file="@{dir}/@{name}.java"
|
<replaceregexp file="@{dir}/@{name}.java"
|
||||||
match="private static final int ZZ_BUFFERSIZE ="
|
match="private static final int ZZ_BUFFERSIZE ="
|
||||||
replace="private int ZZ_BUFFERSIZE ="/>
|
replace="private int ZZ_BUFFERSIZE ="/>
|
||||||
<replaceregexp file="@{dir}/@{name}.java"
|
|
||||||
match="int requested = zzBuffer.length - zzEndRead;"
|
|
||||||
replace="int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;"/>
|
|
||||||
<replaceregexp file="@{dir}/@{name}.java"
|
|
||||||
match="(zzFinalHighSurrogate = 1;)(\r?\n)"
|
|
||||||
replace="\1\2 if (totalRead == 1) { return true; }\2"/>
|
|
||||||
</sequential>
|
</sequential>
|
||||||
</macrodef>
|
</macrodef>
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// This file was automatically generated by getUnicodeEmojiProperties.pl
|
||||||
|
// from: http://unicode.org/Public/emoji/11.0/emoji-data.txt
|
||||||
|
|
||||||
|
Emoji = [\u{23}\u{2A}\u{30}-\u{39}\u{A9}\u{AE}\u{203C}\u{2049}\u{2122}\u{2139}\u{2194}-\u{2199}\u{21A9}-\u{21AA}\u{231A}-\u{231B}\u{2328}\u{23CF}\u{23E9}-\u{23F3}\u{23F8}-\u{23FA}\u{24C2}\u{25AA}-\u{25AB}\u{25B6}\u{25C0}\u{25FB}-\u{25FE}\u{2600}-\u{2604}\u{260E}\u{2611}\u{2614}-\u{2615}\u{2618}\u{261D}\u{2620}\u{2622}-\u{2623}\u{2626}\u{262A}\u{262E}-\u{262F}\u{2638}-\u{263A}\u{2640}\u{2642}\u{2648}-\u{2653}\u{265F}-\u{2660}\u{2663}\u{2665}-\u{2666}\u{2668}\u{267B}\u{267E}-\u{267F}\u{2692}-\u{2697}\u{2699}\u{269B}-\u{269C}\u{26A0}-\u{26A1}\u{26AA}-\u{26AB}\u{26B0}-\u{26B1}\u{26BD}-\u{26BE}\u{26C4}-\u{26C5}\u{26C8}\u{26CE}-\u{26CF}\u{26D1}\u{26D3}-\u{26D4}\u{26E9}-\u{26EA}\u{26F0}-\u{26F5}\u{26F7}-\u{26FA}\u{26FD}\u{2702}\u{2705}\u{2708}-\u{270D}\u{270F}\u{2712}\u{2714}\u{2716}\u{271D}\u{2721}\u{2728}\u{2733}-\u{2734}\u{2744}\u{2747}\u{274C}\u{274E}\u{2753}-\u{2755}\u{2757}\u{2763}-\u{2764}\u{2795}-\u{2797}\u{27A1}\u{27B0}\u{27BF}\u{2934}-\u{2935}\u{2B05}-\u{2B07}\u{2B1B}-\u{2B1C}\u{2B50}\u{2B55}\u{3030}\u{303D}\u{3297}\u{3299}\u{1F004}\u{1F0CF}\u{1F170}-\u{1F171}\u{1F17E}-\u{1F17F}\u{1F18E}\u{1F191}-\u{1F19A}\u{1F1E6}-\u{1F1FF}\u{1F201}-\u{1F202}\u{1F21A}\u{1F22F}\u{1F232}-\u{1F23A}\u{1F250}-\u{1F251}\u{1F300}-\u{1F321}\u{1F324}-\u{1F393}\u{1F396}-\u{1F397}\u{1F399}-\u{1F39B}\u{1F39E}-\u{1F3F0}\u{1F3F3}-\u{1F3F5}\u{1F3F7}-\u{1F4FD}\u{1F4FF}-\u{1F53D}\u{1F549}-\u{1F54E}\u{1F550}-\u{1F567}\u{1F56F}-\u{1F570}\u{1F573}-\u{1F57A}\u{1F587}\u{1F58A}-\u{1F58D}\u{1F590}\u{1F595}-\u{1F596}\u{1F5A4}-\u{1F5A5}\u{1F5A8}\u{1F5B1}-\u{1F5B2}\u{1F5BC}\u{1F5C2}-\u{1F5C4}\u{1F5D1}-\u{1F5D3}\u{1F5DC}-\u{1F5DE}\u{1F5E1}\u{1F5E3}\u{1F5E8}\u{1F5EF}\u{1F5F3}\u{1F5FA}-\u{1F64F}\u{1F680}-\u{1F6C5}\u{1F6CB}-\u{1F6D2}\u{1F6E0}-\u{1F6E5}\u{1F6E9}\u{1F6EB}-\u{1F6EC}\u{1F6F0}\u{1F6F3}-\u{1F6F9}\u{1F910}-\u{1F93A}\u{1F93C}-\u{1F93E}\u{1F940}-\u{1F945}\u{1F947}-\u{1F970}\u{1F973}-\u{1F976}\u{1F97A}\u{1F97C}-\u{1F9A2}\u{1F9B0}-\u{1F9B9}\u{1F9C0}-\u{1F9C2}\u{1F9D0}-\u{1F9FF}]
|
||||||
|
Emoji_Modifier = [\u{1F3FB}-\u{1F3FF}]
|
||||||
|
Emoji_Modifier_Base = [\u{261D}\u{26F9}\u{270A}-\u{270D}\u{1F385}\u{1F3C2}-\u{1F3C4}\u{1F3C7}\u{1F3CA}-\u{1F3CC}\u{1F442}-\u{1F443}\u{1F446}-\u{1F450}\u{1F466}-\u{1F469}\u{1F46E}\u{1F470}-\u{1F478}\u{1F47C}\u{1F481}-\u{1F483}\u{1F485}-\u{1F487}\u{1F4AA}\u{1F574}-\u{1F575}\u{1F57A}\u{1F590}\u{1F595}-\u{1F596}\u{1F645}-\u{1F647}\u{1F64B}-\u{1F64F}\u{1F6A3}\u{1F6B4}-\u{1F6B6}\u{1F6C0}\u{1F6CC}\u{1F918}-\u{1F91C}\u{1F91E}-\u{1F91F}\u{1F926}\u{1F930}-\u{1F939}\u{1F93D}-\u{1F93E}\u{1F9B5}-\u{1F9B6}\u{1F9B8}-\u{1F9B9}\u{1F9D1}-\u{1F9DD}]
|
||||||
|
Extended_Pictographic = [\u{A9}\u{AE}\u{203C}\u{2049}\u{2122}\u{2139}\u{2194}-\u{2199}\u{21A9}-\u{21AA}\u{231A}-\u{231B}\u{2328}\u{2388}\u{23CF}\u{23E9}-\u{23F3}\u{23F8}-\u{23FA}\u{24C2}\u{25AA}-\u{25AB}\u{25B6}\u{25C0}\u{25FB}-\u{25FE}\u{2600}-\u{2605}\u{2607}-\u{2612}\u{2614}-\u{2685}\u{2690}-\u{2705}\u{2708}-\u{2712}\u{2714}\u{2716}\u{271D}\u{2721}\u{2728}\u{2733}-\u{2734}\u{2744}\u{2747}\u{274C}\u{274E}\u{2753}-\u{2755}\u{2757}\u{2763}-\u{2767}\u{2795}-\u{2797}\u{27A1}\u{27B0}\u{27BF}\u{2934}-\u{2935}\u{2B05}-\u{2B07}\u{2B1B}-\u{2B1C}\u{2B50}\u{2B55}\u{3030}\u{303D}\u{3297}\u{3299}\u{1F000}-\u{1F0FF}\u{1F10D}-\u{1F10F}\u{1F12F}\u{1F16C}-\u{1F171}\u{1F17E}-\u{1F17F}\u{1F18E}\u{1F191}-\u{1F19A}\u{1F1AD}-\u{1F1E5}\u{1F201}-\u{1F20F}\u{1F21A}\u{1F22F}\u{1F232}-\u{1F23A}\u{1F23C}-\u{1F23F}\u{1F249}-\u{1F3FA}\u{1F400}-\u{1F53D}\u{1F546}-\u{1F64F}\u{1F680}-\u{1F6FF}\u{1F774}-\u{1F77F}\u{1F7D5}-\u{1F7FF}\u{1F80C}-\u{1F80F}\u{1F848}-\u{1F84F}\u{1F85A}-\u{1F85F}\u{1F888}-\u{1F88F}\u{1F8AE}-\u{1F8FF}\u{1F90C}-\u{1F93A}\u{1F93C}-\u{1F945}\u{1F947}-\u{1FFFD}]
|
||||||
|
|
|
@ -0,0 +1,168 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
|
||||||
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
# contributor license agreements. See the NOTICE file distributed with
|
||||||
|
# this work for additional information regarding copyright ownership.
|
||||||
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
# (the "License"); you may not use this file except in compliance with
|
||||||
|
# the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
use warnings;
|
||||||
|
use strict;
|
||||||
|
use File::Spec;
|
||||||
|
use Getopt::Long;
|
||||||
|
use LWP::UserAgent;
|
||||||
|
|
||||||
|
# Split the path this script was invoked as: the script name is shown in the
# usage message and in the generated file's header, and the volume/directory
# are used to place the output file.
my ($volume, $directory, $script_name) = File::Spec->splitpath($0);

# The emoji data version is required.  It is interpolated into the download
# URL below, so the whole value (not merely a substring of it) must have the
# form X.Y.
my $version = '';
unless (GetOptions("version=s" => \$version) && $version =~ /\A\d+\.\d+\z/) {
    print STDERR "Usage: $script_name -v <version>\n";
    print STDERR "\tversion must be of the form X.Y, e.g. 9.0\n"
        if ($version);
    exit 1;
}
my $emoji_data_url = "http://unicode.org/Public/emoji/$version/emoji-data.txt";
my $output_filename = "UnicodeEmojiProperties.jflex";
|
||||||
|
my $header =<<"__HEADER__";
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// This file was automatically generated by ${script_name}
|
||||||
|
// from: ${emoji_data_url}
|
||||||
|
|
||||||
|
__HEADER__
|
||||||
|
|
||||||
|
# Collected code point ranges, keyed by property name; filled in by
# parse_emoji_data_file().
my $property_ranges = {};

# Only these emoji-data.txt properties are extracted.
my $wanted_properties = {
    map { $_ => 1 } qw(Emoji Emoji_Modifier Emoji_Modifier_Base Extended_Pictographic)
};

parse_emoji_data_file($emoji_data_url, $property_ranges, $wanted_properties);

# The output path is built from the volume and directory this script was
# invoked with, plus the fixed output file name.
my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
output_jflex_include_file($output_path, $property_ranges);
|
||||||
|
|
||||||
|
|
||||||
|
# sub parse_emoji_data_file
#
# Downloads and parses the emoji-data.txt file at the given URL, collecting
# the code point ranges assigned to each wanted property.  Ranges of the same
# property that are adjacent to, or overlap, the previously collected range
# are merged into it.  Every matching data line is used as-is: no filtering
# by the age noted in a line's trailing comment is performed.
#
# Parameters:
#
#  - Emoji data file URL
#  - Reference to hash of properties mapped to an array of alternating (start,end) code point ranges;
#    filled in by this sub, with code points stored as integers
#  - Reference to hash of wanted property names
#
sub parse_emoji_data_file {
    my ($url, $prop_ranges, $wanted_props) = @_;

    my $content = get_URL_content($url);
    print STDERR "Parsing '$url'...";

    LINE:
    for my $line (split /\r?\n/, $content) {
        ## 231A..231B    ; Emoji_Presentation   #  1.1  [2] (⌚..⌛)    watch..hourglass done
        ## 1F9C0         ; Emoji_Presentation   #  8.0  [1] (🧀)       cheese wedge
        ## 1FA00..1FA5F  ; Extended_Pictographic#   NA [96] (🨀️..️)    <reserved-1FA00>..<reserved-1FA5F>
        #
        # Code points are written with 4 to 6 hex digits; lines that do not
        # start with a code point (comments, blank lines) carry no data.
        next LINE
            unless $line =~ /^([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))?\s*;\s*([^\s#]+)/;
        my ($start, $end, $prop) = ($1, $2, $3);

        next LINE unless defined($wanted_props->{$prop});   # Skip unless we want ranges for this property

        $prop_ranges->{$prop} = [] unless defined($prop_ranges->{$prop});
        my $ranges = $prop_ranges->{$prop};

        # A line holding a single code point is a range of length one.
        my $start_dec = hex($start);
        my $end_dec   = defined($end) ? hex($end) : $start_dec;

        if (scalar(@$ranges) == 0 || $start_dec > $ranges->[-1] + 1) {
            # Can't merge with the previous range: start a new one.
            push @$ranges, $start_dec, $end_dec;
        }
        elsif ($end_dec > $ranges->[-1]) {
            # Adjacent or overlapping: extend the previous range, but never
            # shrink it when the new range ends before the previous one does.
            $ranges->[-1] = $end_dec;
        }
    }

    print STDERR "done.\n";
    return;
}
|
||||||
|
|
||||||
|
# sub get_URL_content
#
# Issues an HTTP GET for the given URL and returns the content of the
# response.  Progress is reported on STDERR; if the request does not
# succeed, the response's status line is printed and the script exits
# with status 1.
#
# Parameter:
#
#  - URL to get content for
#
sub get_URL_content {
    my ($url) = @_;

    print STDERR "Retrieving '$url'...";
    my $response = LWP::UserAgent->new->request(HTTP::Request->new(GET => $url));

    if (!$response->is_success) {
        print STDERR "Failed to download '$url':\n\t", $response->status_line, "\n";
        exit 1;
    }

    print STDERR "done.\n";
    return $response->content;
}
|
||||||
|
|
||||||
|
|
||||||
|
# sub output_jflex_include_file
#
# Writes a JFlex include file: the file header ($header), followed by one
# macro definition per property, in sorted property-name order, of the form
#
#   Property = [\u{START}-\u{END}...]
#
# and a trailing empty line.  A range whose end is not greater than its
# start is written as a single \u{...} escape.  Dies if the output file
# cannot be opened or closed.
#
# Parameters:
#
#  - Output path
#  - Reference to hash mapping properties to an array of alternating (start,end) codepoint ranges
#
sub output_jflex_include_file {
    my ($path, $prop_ranges) = @_;

    # Three-argument open with the low-precedence "or", so that die applies
    # to the result of open() rather than to the file name string.
    open(my $out, '>', $path)
        or die "Error opening '$path' for writing: $!";

    print STDERR "Writing '$path'...";

    print {$out} $header;

    for my $prop (sort keys %$prop_ranges) {
        my $ranges = $prop_ranges->{$prop};
        print {$out} "$prop = [";
        # $ranges is a flat list of (start, end) pairs.
        for (my $index = 0; $index < scalar(@$ranges); $index += 2) {
            my ($start, $end) = @{$ranges}[$index, $index + 1];
            printf {$out} "\\u{%X}", $start;
            printf {$out} "-\\u{%X}", $end if ($end > $start);
        }
        print {$out} "]\n";
    }

    print {$out} "\n";

    # Buffered write errors only surface when the handle is closed.
    close($out)
        or die "Error closing '$path': $!";
    print STDERR "done.\n";
    return;
}
|
|
@ -0,0 +1,342 @@
|
||||||
|
|
||||||
|
/** This character denotes the end of file */
|
||||||
|
public static final int YYEOF = -1;
|
||||||
|
|
||||||
|
/** initial size of the lookahead buffer */
|
||||||
|
--- private static final int ZZ_BUFFERSIZE = ...;
|
||||||
|
|
||||||
|
/** lexical states */
|
||||||
|
--- lexical states, charmap
|
||||||
|
|
||||||
|
/* error codes */
|
||||||
|
private static final int ZZ_UNKNOWN_ERROR = 0;
|
||||||
|
private static final int ZZ_NO_MATCH = 1;
|
||||||
|
private static final int ZZ_PUSHBACK_2BIG = 2;
|
||||||
|
|
||||||
|
/* error messages for the codes above */
|
||||||
|
private static final String ZZ_ERROR_MSG[] = {
|
||||||
|
"Unknown internal scanner error",
|
||||||
|
"Error: could not match input",
|
||||||
|
"Error: pushback value was too large"
|
||||||
|
};
|
||||||
|
|
||||||
|
--- isFinal list
|
||||||
|
/** the input device */
|
||||||
|
private java.io.Reader zzReader;
|
||||||
|
|
||||||
|
/** the current state of the DFA */
|
||||||
|
private int zzState;
|
||||||
|
|
||||||
|
/** the current lexical state */
|
||||||
|
private int zzLexicalState = YYINITIAL;
|
||||||
|
|
||||||
|
/** this buffer contains the current text to be matched and is
|
||||||
|
the source of the yytext() string */
|
||||||
|
private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
|
||||||
|
|
||||||
|
/** the textposition at the last accepting state */
|
||||||
|
private int zzMarkedPos;
|
||||||
|
|
||||||
|
/** the current text position in the buffer */
|
||||||
|
private int zzCurrentPos;
|
||||||
|
|
||||||
|
/** startRead marks the beginning of the yytext() string in the buffer */
|
||||||
|
private int zzStartRead;
|
||||||
|
|
||||||
|
/** endRead marks the last character in the buffer, that has been read
|
||||||
|
from input */
|
||||||
|
private int zzEndRead;
|
||||||
|
|
||||||
|
/** number of newlines encountered up to the start of the matched text */
|
||||||
|
private int yyline;
|
||||||
|
|
||||||
|
/** the number of characters up to the start of the matched text */
|
||||||
|
private int yychar;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* the number of characters from the last newline up to the start of the
|
||||||
|
* matched text
|
||||||
|
*/
|
||||||
|
private int yycolumn;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* zzAtBOL == true iff the scanner is currently at the beginning of a line
|
||||||
|
*/
|
||||||
|
private boolean zzAtBOL = true;
|
||||||
|
|
||||||
|
/** zzAtEOF == true iff the scanner is at the EOF */
|
||||||
|
private boolean zzAtEOF;
|
||||||
|
|
||||||
|
/** denotes if the user-EOF-code has already been executed */
|
||||||
|
private boolean zzEOFDone;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The number of occupied positions in zzBuffer beyond zzEndRead.
|
||||||
|
* When a lead/high surrogate has been read from the input stream
|
||||||
|
* into the final zzBuffer position, this will have a value of 1;
|
||||||
|
* otherwise, it will have a value of 0.
|
||||||
|
*/
|
||||||
|
private int zzFinalHighSurrogate = 0;
|
||||||
|
|
||||||
|
--- user class code
|
||||||
|
|
||||||
|
--- constructor declaration
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Refills the input buffer.
|
||||||
|
*
|
||||||
|
* @return <code>false</code>, iff there was new input.
|
||||||
|
*
|
||||||
|
* @exception java.io.IOException if any I/O-Error occurs
|
||||||
|
*/
|
||||||
|
private boolean zzRefill() throws java.io.IOException {
|
||||||
|
|
||||||
|
/* first: make room (if you can) */
|
||||||
|
if (zzStartRead > 0) {
|
||||||
|
zzEndRead += zzFinalHighSurrogate;
|
||||||
|
zzFinalHighSurrogate = 0;
|
||||||
|
System.arraycopy(zzBuffer, zzStartRead,
|
||||||
|
zzBuffer, 0,
|
||||||
|
zzEndRead-zzStartRead);
|
||||||
|
|
||||||
|
/* translate stored positions */
|
||||||
|
zzEndRead-= zzStartRead;
|
||||||
|
zzCurrentPos-= zzStartRead;
|
||||||
|
zzMarkedPos-= zzStartRead;
|
||||||
|
zzStartRead = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* is the buffer big enough? */
|
||||||
|
if (zzCurrentPos >= zzBuffer.length - zzFinalHighSurrogate) {
|
||||||
|
/* if not: blow it up */
|
||||||
|
char newBuffer[] = new char[zzBuffer.length*2];
|
||||||
|
System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
|
||||||
|
zzBuffer = newBuffer;
|
||||||
|
zzEndRead += zzFinalHighSurrogate;
|
||||||
|
zzFinalHighSurrogate = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* fill the buffer with new input */
|
||||||
|
int requested = zzBuffer.length - zzEndRead;
|
||||||
|
int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
|
||||||
|
|
||||||
|
/* not supposed to occur according to specification of java.io.Reader */
|
||||||
|
if (numRead == 0) {
|
||||||
|
throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
|
||||||
|
}
|
||||||
|
if (numRead > 0) {
|
||||||
|
zzEndRead += numRead;
|
||||||
|
/* If numRead == requested, we might have requested too few chars to
|
||||||
|
encode a full Unicode character. We assume that a Reader would
|
||||||
|
otherwise never return half characters. */
|
||||||
|
if (numRead == requested) {
|
||||||
|
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
|
||||||
|
--zzEndRead;
|
||||||
|
zzFinalHighSurrogate = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* potentially more input available */
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* numRead < 0 ==> end of stream */
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Closes the input stream.
|
||||||
|
*/
|
||||||
|
public final void yyclose() throws java.io.IOException {
|
||||||
|
zzAtEOF = true; /* indicate end of file */
|
||||||
|
zzEndRead = zzStartRead; /* invalidate buffer */
|
||||||
|
|
||||||
|
if (zzReader != null)
|
||||||
|
zzReader.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Resets the scanner to read from a new input stream.
|
||||||
|
* Does not close the old reader.
|
||||||
|
*
|
||||||
|
* All internal variables are reset, the old input stream
|
||||||
|
* <b>cannot</b> be reused (internal buffer is discarded and lost).
|
||||||
|
* Lexical state is set to <tt>ZZ_INITIAL</tt>.
|
||||||
|
*
|
||||||
|
* Internal scan buffer is resized down to its initial length, if it has grown.
|
||||||
|
*
|
||||||
|
* @param reader the new input stream
|
||||||
|
*/
|
||||||
|
public final void yyreset(java.io.Reader reader) {
|
||||||
|
zzReader = reader;
|
||||||
|
zzAtBOL = true;
|
||||||
|
zzAtEOF = false;
|
||||||
|
zzEOFDone = false;
|
||||||
|
zzEndRead = zzStartRead = 0;
|
||||||
|
zzCurrentPos = zzMarkedPos = 0;
|
||||||
|
zzFinalHighSurrogate = 0;
|
||||||
|
yyline = yychar = yycolumn = 0;
|
||||||
|
zzLexicalState = YYINITIAL;
|
||||||
|
if (zzBuffer.length > ZZ_BUFFERSIZE)
|
||||||
|
zzBuffer = new char[ZZ_BUFFERSIZE];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the current lexical state.
|
||||||
|
*/
|
||||||
|
public final int yystate() {
|
||||||
|
return zzLexicalState;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Enters a new lexical state
|
||||||
|
*
|
||||||
|
* @param newState the new lexical state
|
||||||
|
*/
|
||||||
|
public final void yybegin(int newState) {
|
||||||
|
zzLexicalState = newState;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the text matched by the current regular expression.
|
||||||
|
*/
|
||||||
|
public final String yytext() {
|
||||||
|
return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the character at position <tt>pos</tt> from the
|
||||||
|
* matched text.
|
||||||
|
*
|
||||||
|
* It is equivalent to yytext().charAt(pos), but faster
|
||||||
|
*
|
||||||
|
* @param pos the position of the character to fetch.
|
||||||
|
* A value from 0 to yylength()-1.
|
||||||
|
*
|
||||||
|
* @return the character at position pos
|
||||||
|
*/
|
||||||
|
public final char yycharat(int pos) {
|
||||||
|
return zzBuffer[zzStartRead+pos];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the length of the matched text region.
|
||||||
|
*/
|
||||||
|
public final int yylength() {
|
||||||
|
return zzMarkedPos-zzStartRead;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reports an error that occurred while scanning.
|
||||||
|
*
|
||||||
|
* In a wellformed scanner (no or only correct usage of
|
||||||
|
* yypushback(int) and a match-all fallback rule) this method
|
||||||
|
* will only be called with things that "Can't Possibly Happen".
|
||||||
|
* If this method is called, something is seriously wrong
|
||||||
|
* (e.g. a JFlex bug producing a faulty scanner etc.).
|
||||||
|
*
|
||||||
|
* Usual syntax/scanner level error handling should be done
|
||||||
|
* in error fallback rules.
|
||||||
|
*
|
||||||
|
* @param errorCode the code of the errormessage to display
|
||||||
|
*/
|
||||||
|
--- zzScanError declaration
|
||||||
|
String message;
|
||||||
|
try {
|
||||||
|
message = ZZ_ERROR_MSG[errorCode];
|
||||||
|
}
|
||||||
|
catch (ArrayIndexOutOfBoundsException e) {
|
||||||
|
message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
|
||||||
|
}
|
||||||
|
|
||||||
|
--- throws clause
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Pushes the specified amount of characters back into the input stream.
|
||||||
|
*
|
||||||
|
* They will be read again by the next call of the scanning method
|
||||||
|
*
|
||||||
|
* @param number the number of characters to be read again.
|
||||||
|
* This number must not be greater than yylength()!
|
||||||
|
*/
|
||||||
|
--- yypushback decl (contains zzScanError exception)
|
||||||
|
if ( number > yylength() )
|
||||||
|
zzScanError(ZZ_PUSHBACK_2BIG);
|
||||||
|
|
||||||
|
zzMarkedPos -= number;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
--- zzDoEOF
|
||||||
|
/**
|
||||||
|
* Resumes scanning until the next regular expression is matched,
|
||||||
|
* the end of input is encountered or an I/O-Error occurs.
|
||||||
|
*
|
||||||
|
* @return the next token
|
||||||
|
* @exception java.io.IOException if any I/O-Error occurs
|
||||||
|
*/
|
||||||
|
--- yylex declaration
|
||||||
|
int zzInput;
|
||||||
|
int zzAction;
|
||||||
|
|
||||||
|
// cached fields:
|
||||||
|
int zzCurrentPosL;
|
||||||
|
int zzMarkedPosL;
|
||||||
|
int zzEndReadL = zzEndRead;
|
||||||
|
char [] zzBufferL = zzBuffer;
|
||||||
|
char [] zzCMapL = ZZ_CMAP;
|
||||||
|
|
||||||
|
--- local declarations
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
zzMarkedPosL = zzMarkedPos;
|
||||||
|
|
||||||
|
--- start admin (line, char, col count)
|
||||||
|
zzAction = -1;
|
||||||
|
|
||||||
|
zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
|
||||||
|
|
||||||
|
--- start admin (lexstate etc)
|
||||||
|
|
||||||
|
zzForAction: {
|
||||||
|
while (true) {
|
||||||
|
|
||||||
|
--- next input, line, col, char count, next transition, isFinal action
|
||||||
|
zzAction = zzState;
|
||||||
|
zzMarkedPosL = zzCurrentPosL;
|
||||||
|
--- line count update
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// store back cached position
|
||||||
|
zzMarkedPos = zzMarkedPosL;
|
||||||
|
--- char count update
|
||||||
|
|
||||||
|
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
|
||||||
|
zzAtEOF = true;
|
||||||
|
--- eofvalue
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
--- actions
|
||||||
|
default:
|
||||||
|
--- no match
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
--- main
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,348 @@
|
||||||
|
|
||||||
|
/** This character denotes the end of file */
|
||||||
|
public static final int YYEOF = -1;
|
||||||
|
|
||||||
|
/** initial size of the lookahead buffer */
|
||||||
|
--- private static final int ZZ_BUFFERSIZE = ...;
|
||||||
|
|
||||||
|
/** lexical states */
|
||||||
|
--- lexical states, charmap
|
||||||
|
|
||||||
|
/* error codes */
|
||||||
|
private static final int ZZ_UNKNOWN_ERROR = 0;
|
||||||
|
private static final int ZZ_NO_MATCH = 1;
|
||||||
|
private static final int ZZ_PUSHBACK_2BIG = 2;
|
||||||
|
|
||||||
|
/* error messages for the codes above */
|
||||||
|
private static final String ZZ_ERROR_MSG[] = {
|
||||||
|
"Unknown internal scanner error",
|
||||||
|
"Error: could not match input",
|
||||||
|
"Error: pushback value was too large"
|
||||||
|
};
|
||||||
|
|
||||||
|
--- isFinal list
|
||||||
|
/** the input device */
|
||||||
|
private java.io.Reader zzReader;
|
||||||
|
|
||||||
|
/** the current state of the DFA */
|
||||||
|
private int zzState;
|
||||||
|
|
||||||
|
/** the current lexical state */
|
||||||
|
private int zzLexicalState = YYINITIAL;
|
||||||
|
|
||||||
|
/** this buffer contains the current text to be matched and is
|
||||||
|
the source of the yytext() string */
|
||||||
|
private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
|
||||||
|
|
||||||
|
/** the textposition at the last accepting state */
|
||||||
|
private int zzMarkedPos;
|
||||||
|
|
||||||
|
/** the current text position in the buffer */
|
||||||
|
private int zzCurrentPos;
|
||||||
|
|
||||||
|
/** startRead marks the beginning of the yytext() string in the buffer */
|
||||||
|
private int zzStartRead;
|
||||||
|
|
||||||
|
/** endRead marks the last character in the buffer, that has been read
|
||||||
|
from input */
|
||||||
|
private int zzEndRead;
|
||||||
|
|
||||||
|
/** number of newlines encountered up to the start of the matched text */
|
||||||
|
private int yyline;
|
||||||
|
|
||||||
|
/** the number of characters up to the start of the matched text */
|
||||||
|
private int yychar;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* the number of characters from the last newline up to the start of the
|
||||||
|
* matched text
|
||||||
|
*/
|
||||||
|
private int yycolumn;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* zzAtBOL == true iff the scanner is currently at the beginning of a line
|
||||||
|
*/
|
||||||
|
private boolean zzAtBOL = true;
|
||||||
|
|
||||||
|
/** zzAtEOF == true iff the scanner is at the EOF */
|
||||||
|
private boolean zzAtEOF;
|
||||||
|
|
||||||
|
/** denotes if the user-EOF-code has already been executed */
|
||||||
|
private boolean zzEOFDone;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The number of occupied positions in zzBuffer beyond zzEndRead.
|
||||||
|
* When a lead/high surrogate has been read from the input stream
|
||||||
|
* into the final zzBuffer position, this will have a value of 1;
|
||||||
|
* otherwise, it will have a value of 0.
|
||||||
|
*/
|
||||||
|
private int zzFinalHighSurrogate = 0;
|
||||||
|
|
||||||
|
--- user class code
|
||||||
|
|
||||||
|
--- constructor declaration
|
||||||
|
|
||||||
|
/* -------------------------------------------------------------------------------- */
|
||||||
|
/* Begin Lucene-specific disable-buffer-expansion modifications to skeleton.default */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Refills the input buffer.
|
||||||
|
*
|
||||||
|
* @return <code>false</code>, iff there was new input.
|
||||||
|
*
|
||||||
|
* @exception java.io.IOException if any I/O-Error occurs
|
||||||
|
*/
|
||||||
|
private boolean zzRefill() throws java.io.IOException {
|
||||||
|
|
||||||
|
/* first: make room (if you can) */
|
||||||
|
if (zzStartRead > 0) {
|
||||||
|
zzEndRead += zzFinalHighSurrogate;
|
||||||
|
zzFinalHighSurrogate = 0;
|
||||||
|
System.arraycopy(zzBuffer, zzStartRead,
|
||||||
|
zzBuffer, 0,
|
||||||
|
zzEndRead-zzStartRead);
|
||||||
|
|
||||||
|
/* translate stored positions */
|
||||||
|
zzEndRead-= zzStartRead;
|
||||||
|
zzCurrentPos-= zzStartRead;
|
||||||
|
zzMarkedPos-= zzStartRead;
|
||||||
|
zzStartRead = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* fill the buffer with new input */
|
||||||
|
int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;
|
||||||
|
if (requested == 0) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
|
||||||
|
|
||||||
|
/* not supposed to occur according to specification of java.io.Reader */
|
||||||
|
if (numRead == 0) {
|
||||||
|
throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
|
||||||
|
}
|
||||||
|
if (numRead > 0) {
|
||||||
|
zzEndRead += numRead;
|
||||||
|
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
|
||||||
|
if (numRead == requested) { // We might have requested too few chars to encode a full Unicode character.
|
||||||
|
--zzEndRead;
|
||||||
|
zzFinalHighSurrogate = 1;
|
||||||
|
if (numRead == 1) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
} else { // There is room in the buffer for at least one more char
|
||||||
|
int c = zzReader.read(); // Expecting to read a low surrogate char
|
||||||
|
if (c == -1) {
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
zzBuffer[zzEndRead++] = (char)c;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* potentially more input available */
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* numRead < 0 ==> end of stream */
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* End Lucene-specific disable-buffer-expansion modifications to skeleton.default */
|
||||||
|
/* ------------------------------------------------------------------------------ */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Closes the input stream.
|
||||||
|
*/
|
||||||
|
public final void yyclose() throws java.io.IOException {
|
||||||
|
zzAtEOF = true; /* indicate end of file */
|
||||||
|
zzEndRead = zzStartRead; /* invalidate buffer */
|
||||||
|
|
||||||
|
if (zzReader != null)
|
||||||
|
zzReader.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Resets the scanner to read from a new input stream.
|
||||||
|
* Does not close the old reader.
|
||||||
|
*
|
||||||
|
* All internal variables are reset, the old input stream
|
||||||
|
* <b>cannot</b> be reused (internal buffer is discarded and lost).
|
||||||
|
* Lexical state is set to <tt>YYINITIAL</tt>.
|
||||||
|
*
|
||||||
|
* Internal scan buffer is resized down to its initial length, if it has grown.
|
||||||
|
*
|
||||||
|
* @param reader the new input stream
|
||||||
|
*/
|
||||||
|
public final void yyreset(java.io.Reader reader) {
|
||||||
|
zzReader = reader; // the previous reader is only replaced, not closed
|
||||||
|
zzAtBOL = true;
|
||||||
|
zzAtEOF = false;
|
||||||
|
zzEOFDone = false;
|
||||||
|
zzEndRead = zzStartRead = 0; // all buffer positions go back to index 0
|
||||||
|
zzCurrentPos = zzMarkedPos = 0;
|
||||||
|
zzFinalHighSurrogate = 0;
|
||||||
|
yyline = yychar = yycolumn = 0; // line, character and column counters restart at 0
|
||||||
|
zzLexicalState = YYINITIAL;
|
||||||
|
if (zzBuffer.length > ZZ_BUFFERSIZE) // only a buffer longer than ZZ_BUFFERSIZE is replaced
|
||||||
|
zzBuffer = new char[ZZ_BUFFERSIZE];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the current lexical state.
|
||||||
|
*
|
||||||
|
* @return the value currently held in zzLexicalState
|
||||||
|
*/
|
||||||
|
public final int yystate() {
|
||||||
|
return zzLexicalState;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Enters a new lexical state.
|
||||||
|
*
|
||||||
|
* The value is stored in zzLexicalState as given; this method does not
|
||||||
|
* validate it.
|
||||||
|
*
|
||||||
|
* @param newState the new lexical state
|
||||||
|
*/
|
||||||
|
public final void yybegin(int newState) {
|
||||||
|
zzLexicalState = newState;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the text matched by the current regular expression.
|
||||||
|
*
|
||||||
|
* @return a new String copied from zzBuffer, starting at zzStartRead
|
||||||
|
* and containing zzMarkedPos-zzStartRead chars
|
||||||
|
*/
|
||||||
|
public final String yytext() {
|
||||||
|
return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the character at position <tt>pos</tt> from the
|
||||||
|
* matched text.
|
||||||
|
*
|
||||||
|
* It is equivalent to yytext().charAt(pos), but faster
|
||||||
|
*
|
||||||
|
* The buffer is indexed directly at zzStartRead+pos; this method does
|
||||||
|
* no range check of its own against yylength().
|
||||||
|
*
|
||||||
|
* @param pos the position of the character to fetch.
|
||||||
|
* A value from 0 to yylength()-1.
|
||||||
|
*
|
||||||
|
* @return the character at position pos
|
||||||
|
*/
|
||||||
|
public final char yycharat(int pos) {
|
||||||
|
return zzBuffer[zzStartRead+pos];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the length of the matched text region.
|
||||||
|
*
|
||||||
|
* @return zzMarkedPos-zzStartRead, a count of chars in the buffer
|
||||||
|
*/
|
||||||
|
public final int yylength() {
|
||||||
|
return zzMarkedPos-zzStartRead;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reports an error that occurred while scanning.
|
||||||
|
*
|
||||||
|
* In a well-formed scanner (no or only correct usage of
|
||||||
|
* yypushback(int) and a match-all fallback rule) this method
|
||||||
|
* will only be called with things that "Can't Possibly Happen".
|
||||||
|
* If this method is called, something is seriously wrong
|
||||||
|
* (e.g. a JFlex bug producing a faulty scanner etc.).
|
||||||
|
*
|
||||||
|
* Usual syntax/scanner level error handling should be done
|
||||||
|
* in error fallback rules.
|
||||||
|
*
|
||||||
|
* @param errorCode the code of the error message to display
|
||||||
|
*/
|
||||||
|
--- zzScanError declaration
|
||||||
|
String message;
|
||||||
|
try {
|
||||||
|
message = ZZ_ERROR_MSG[errorCode];
|
||||||
|
}
|
||||||
|
catch (ArrayIndexOutOfBoundsException e) {
|
||||||
|
message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
|
||||||
|
}
|
||||||
|
|
||||||
|
--- throws clause
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Pushes the specified amount of characters back into the input stream.
|
||||||
|
*
|
||||||
|
* They will be read again by the next call of the scanning method
|
||||||
|
*
|
||||||
|
* @param number the number of characters to be read again.
|
||||||
|
* This number must not be greater than yylength()!
|
||||||
|
*/
|
||||||
|
--- yypushback decl (contains zzScanError exception)
|
||||||
|
if ( number > yylength() )
|
||||||
|
zzScanError(ZZ_PUSHBACK_2BIG);
|
||||||
|
|
||||||
|
zzMarkedPos -= number;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
--- zzDoEOF
|
||||||
|
/**
|
||||||
|
* Resumes scanning until the next regular expression is matched,
|
||||||
|
* the end of input is encountered or an I/O-Error occurs.
|
||||||
|
*
|
||||||
|
* @return the next token
|
||||||
|
* @exception java.io.IOException if any I/O-Error occurs
|
||||||
|
*/
|
||||||
|
--- yylex declaration
|
||||||
|
int zzInput;
|
||||||
|
int zzAction;
|
||||||
|
|
||||||
|
// cached fields:
|
||||||
|
int zzCurrentPosL;
|
||||||
|
int zzMarkedPosL;
|
||||||
|
int zzEndReadL = zzEndRead;
|
||||||
|
char [] zzBufferL = zzBuffer;
|
||||||
|
char [] zzCMapL = ZZ_CMAP;
|
||||||
|
|
||||||
|
--- local declarations
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
zzMarkedPosL = zzMarkedPos;
|
||||||
|
|
||||||
|
--- start admin (line, char, col count)
|
||||||
|
zzAction = -1;
|
||||||
|
|
||||||
|
zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
|
||||||
|
|
||||||
|
--- start admin (lexstate etc)
|
||||||
|
|
||||||
|
zzForAction: {
|
||||||
|
while (true) {
|
||||||
|
|
||||||
|
--- next input, line, col, char count, next transition, isFinal action
|
||||||
|
zzAction = zzState;
|
||||||
|
zzMarkedPosL = zzCurrentPosL;
|
||||||
|
--- line count update
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// store back cached position
|
||||||
|
zzMarkedPos = zzMarkedPosL;
|
||||||
|
--- char count update
|
||||||
|
|
||||||
|
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
|
||||||
|
zzAtEOF = true;
|
||||||
|
--- eofvalue
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
--- actions
|
||||||
|
default:
|
||||||
|
--- no match
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
--- main
|
||||||
|
|
||||||
|
}
|
|
@ -1,4 +1,4 @@
|
||||||
/* The following code was generated by JFlex 1.6.0 */
|
/* The following code was generated by JFlex 1.7.0 */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
@ -36,6 +36,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
* <li><HIRAGANA>: A single hiragana character</li>
|
* <li><HIRAGANA>: A single hiragana character</li>
|
||||||
* <li><KATAKANA>: A sequence of katakana characters</li>
|
* <li><KATAKANA>: A sequence of katakana characters</li>
|
||||||
* <li><HANGUL>: A sequence of Hangul characters</li>
|
* <li><HANGUL>: A sequence of Hangul characters</li>
|
||||||
|
* <li><EMOJI>: A sequence of Emoji characters</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings("fallthrough")
|
@SuppressWarnings("fallthrough")
|
||||||
|
@ -65,147 +66,212 @@ public final class StandardTokenizerImpl {
|
||||||
* Translates characters to character classes
|
* Translates characters to character classes
|
||||||
*/
|
*/
|
||||||
private static final String ZZ_CMAP_PACKED =
|
private static final String ZZ_CMAP_PACKED =
|
||||||
"\42\0\1\15\4\0\1\14\4\0\1\7\1\0\1\10\1\0\12\4"+
|
"\42\0\1\32\1\7\3\0\1\31\2\0\1\7\1\0\1\24\1\0"+
|
||||||
"\1\6\1\7\5\0\32\1\4\0\1\11\1\0\32\1\57\0\1\1"+
|
"\1\25\1\0\12\21\1\23\1\24\5\0\32\15\4\0\1\26\1\0"+
|
||||||
"\2\0\1\3\7\0\1\1\1\0\1\6\2\0\1\1\5\0\27\1"+
|
"\32\15\56\0\1\4\1\15\2\0\1\5\1\4\6\0\1\15\1\0"+
|
||||||
"\1\0\37\1\1\0\u01ca\1\4\0\14\1\5\0\1\6\10\0\5\1"+
|
"\1\23\2\0\1\15\5\0\27\15\1\0\37\15\1\0\u01ca\15\4\0"+
|
||||||
"\7\0\1\1\1\0\1\1\21\0\160\3\5\1\1\0\2\1\2\0"+
|
"\14\15\5\0\1\23\10\0\5\15\7\0\1\15\1\0\1\15\21\0"+
|
||||||
"\4\1\1\7\7\0\1\1\1\6\3\1\1\0\1\1\1\0\24\1"+
|
"\160\5\5\15\1\0\2\15\2\0\4\15\1\24\1\15\6\0\1\15"+
|
||||||
"\1\0\123\1\1\0\213\1\1\0\7\3\236\1\11\0\46\1\2\0"+
|
"\1\23\3\15\1\0\1\15\1\0\24\15\1\0\123\15\1\0\213\15"+
|
||||||
"\1\1\7\0\47\1\1\0\1\7\7\0\55\3\1\0\1\3\1\0"+
|
"\1\0\7\5\246\15\1\0\46\15\2\0\1\15\7\0\47\15\1\0"+
|
||||||
"\2\3\1\0\2\3\1\0\1\3\10\0\33\16\5\0\3\16\1\1"+
|
"\1\24\7\0\55\5\1\0\1\5\1\0\2\5\1\0\2\5\1\0"+
|
||||||
"\1\6\13\0\5\3\7\0\2\7\2\0\13\3\1\0\1\3\3\0"+
|
"\1\5\10\0\33\33\5\0\3\33\1\15\1\23\13\0\6\5\6\0"+
|
||||||
"\53\1\25\3\12\4\1\0\1\4\1\7\1\0\2\1\1\3\143\1"+
|
"\2\24\2\0\13\5\1\0\1\5\3\0\53\15\25\5\12\20\1\0"+
|
||||||
"\1\0\1\1\10\3\1\0\6\3\2\1\2\3\1\0\4\3\2\1"+
|
"\1\20\1\24\1\0\2\15\1\5\143\15\1\0\1\15\10\5\1\0"+
|
||||||
"\12\4\3\1\2\0\1\1\17\0\1\3\1\1\1\3\36\1\33\3"+
|
"\6\5\2\15\2\5\1\0\4\5\2\15\12\20\3\15\2\0\1\15"+
|
||||||
"\2\0\131\1\13\3\1\1\16\0\12\4\41\1\11\3\2\1\2\0"+
|
"\17\0\1\5\1\15\1\5\36\15\33\5\2\0\131\15\13\5\1\15"+
|
||||||
"\1\7\1\0\1\1\5\0\26\1\4\3\1\1\11\3\1\1\3\3"+
|
"\16\0\12\20\41\15\11\5\2\15\2\0\1\24\1\0\1\15\5\0"+
|
||||||
"\1\1\5\3\22\0\31\1\3\3\104\0\1\1\1\0\13\1\67\0"+
|
"\26\15\4\5\1\15\11\5\1\15\3\5\1\15\5\5\22\0\31\15"+
|
||||||
"\33\3\1\0\4\3\66\1\3\3\1\1\22\3\1\1\7\3\12\1"+
|
"\3\5\104\0\25\15\1\0\10\15\26\0\60\5\66\15\3\5\1\15"+
|
||||||
"\2\3\2\0\12\4\1\0\7\1\1\0\7\1\1\0\3\3\1\0"+
|
"\22\5\1\15\7\5\12\15\2\5\2\0\12\20\1\0\20\15\3\5"+
|
||||||
"\10\1\2\0\2\1\2\0\26\1\1\0\7\1\1\0\1\1\3\0"+
|
"\1\0\10\15\2\0\2\15\2\0\26\15\1\0\7\15\1\0\1\15"+
|
||||||
"\4\1\2\0\1\3\1\1\7\3\2\0\2\3\2\0\3\3\1\1"+
|
"\3\0\4\15\2\0\1\5\1\15\7\5\2\0\2\5\2\0\3\5"+
|
||||||
"\10\0\1\3\4\0\2\1\1\0\3\1\2\3\2\0\12\4\2\1"+
|
"\1\15\10\0\1\5\4\0\2\15\1\0\3\15\2\5\2\0\12\20"+
|
||||||
"\17\0\3\3\1\0\6\1\4\0\2\1\2\0\26\1\1\0\7\1"+
|
"\2\15\17\0\3\5\1\0\6\15\4\0\2\15\2\0\26\15\1\0"+
|
||||||
"\1\0\2\1\1\0\2\1\1\0\2\1\2\0\1\3\1\0\5\3"+
|
"\7\15\1\0\2\15\1\0\2\15\1\0\2\15\2\0\1\5\1\0"+
|
||||||
"\4\0\2\3\2\0\3\3\3\0\1\3\7\0\4\1\1\0\1\1"+
|
"\5\5\4\0\2\5\2\0\3\5\3\0\1\5\7\0\4\15\1\0"+
|
||||||
"\7\0\12\4\2\3\3\1\1\3\13\0\3\3\1\0\11\1\1\0"+
|
"\1\15\7\0\12\20\2\5\3\15\1\5\13\0\3\5\1\0\11\15"+
|
||||||
"\3\1\1\0\26\1\1\0\7\1\1\0\2\1\1\0\5\1\2\0"+
|
"\1\0\3\15\1\0\26\15\1\0\7\15\1\0\2\15\1\0\5\15"+
|
||||||
"\1\3\1\1\10\3\1\0\3\3\1\0\3\3\2\0\1\1\17\0"+
|
"\2\0\1\5\1\15\10\5\1\0\3\5\1\0\3\5\2\0\1\15"+
|
||||||
"\2\1\2\3\2\0\12\4\21\0\3\3\1\0\10\1\2\0\2\1"+
|
"\17\0\2\15\2\5\2\0\12\20\11\0\1\15\7\0\3\5\1\0"+
|
||||||
"\2\0\26\1\1\0\7\1\1\0\2\1\1\0\5\1\2\0\1\3"+
|
"\10\15\2\0\2\15\2\0\26\15\1\0\7\15\1\0\2\15\1\0"+
|
||||||
"\1\1\7\3\2\0\2\3\2\0\3\3\10\0\2\3\4\0\2\1"+
|
"\5\15\2\0\1\5\1\15\7\5\2\0\2\5\2\0\3\5\10\0"+
|
||||||
"\1\0\3\1\2\3\2\0\12\4\1\0\1\1\20\0\1\3\1\1"+
|
"\2\5\4\0\2\15\1\0\3\15\2\5\2\0\12\20\1\0\1\15"+
|
||||||
"\1\0\6\1\3\0\3\1\1\0\4\1\3\0\2\1\1\0\1\1"+
|
"\20\0\1\5\1\15\1\0\6\15\3\0\3\15\1\0\4\15\3\0"+
|
||||||
"\1\0\2\1\3\0\2\1\3\0\3\1\3\0\14\1\4\0\5\3"+
|
"\2\15\1\0\1\15\1\0\2\15\3\0\2\15\3\0\3\15\3\0"+
|
||||||
"\3\0\3\3\1\0\4\3\2\0\1\1\6\0\1\3\16\0\12\4"+
|
"\14\15\4\0\5\5\3\0\3\5\1\0\4\5\2\0\1\15\6\0"+
|
||||||
"\21\0\3\3\1\0\10\1\1\0\3\1\1\0\27\1\1\0\12\1"+
|
"\1\5\16\0\12\20\20\0\4\5\1\0\10\15\1\0\3\15\1\0"+
|
||||||
"\1\0\5\1\3\0\1\1\7\3\1\0\3\3\1\0\4\3\7\0"+
|
"\27\15\1\0\20\15\3\0\1\15\7\5\1\0\3\5\1\0\4\5"+
|
||||||
"\2\3\1\0\2\1\6\0\2\1\2\3\2\0\12\4\22\0\2\3"+
|
"\7\0\2\5\1\0\3\15\5\0\2\15\2\5\2\0\12\20\20\0"+
|
||||||
"\1\0\10\1\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1"+
|
"\1\15\3\5\1\0\10\15\1\0\3\15\1\0\27\15\1\0\12\15"+
|
||||||
"\2\0\1\3\1\1\7\3\1\0\3\3\1\0\4\3\7\0\2\3"+
|
"\1\0\5\15\2\0\1\5\1\15\7\5\1\0\3\5\1\0\4\5"+
|
||||||
"\7\0\1\1\1\0\2\1\2\3\2\0\12\4\1\0\2\1\17\0"+
|
"\7\0\2\5\7\0\1\15\1\0\2\15\2\5\2\0\12\20\1\0"+
|
||||||
"\2\3\1\0\10\1\1\0\3\1\1\0\51\1\2\0\1\1\7\3"+
|
"\2\15\16\0\3\5\1\0\10\15\1\0\3\15\1\0\51\15\2\0"+
|
||||||
"\1\0\3\3\1\0\4\3\1\1\10\0\1\3\10\0\2\1\2\3"+
|
"\1\15\7\5\1\0\3\5\1\0\4\5\1\15\5\0\3\15\1\5"+
|
||||||
"\2\0\12\4\12\0\6\1\2\0\2\3\1\0\22\1\3\0\30\1"+
|
"\7\0\3\15\2\5\2\0\12\20\12\0\6\15\2\0\2\5\1\0"+
|
||||||
"\1\0\11\1\1\0\1\1\2\0\7\1\3\0\1\3\4\0\6\3"+
|
"\22\15\3\0\30\15\1\0\11\15\1\0\1\15\2\0\7\15\3\0"+
|
||||||
"\1\0\1\3\1\0\10\3\22\0\2\3\15\0\60\20\1\21\2\20"+
|
"\1\5\4\0\6\5\1\0\1\5\1\0\10\5\6\0\12\20\2\0"+
|
||||||
"\7\21\5\0\7\20\10\21\1\0\12\4\47\0\2\20\1\0\1\20"+
|
"\2\5\15\0\60\34\1\35\2\34\7\35\5\0\7\34\10\35\1\0"+
|
||||||
"\2\0\2\20\1\0\1\20\2\0\1\20\6\0\4\20\1\0\7\20"+
|
"\12\20\47\0\2\34\1\0\1\34\2\0\2\34\1\0\1\34\2\0"+
|
||||||
"\1\0\3\20\1\0\1\20\1\0\1\20\2\0\2\20\1\0\4\20"+
|
"\1\34\6\0\4\34\1\0\7\34\1\0\3\34\1\0\1\34\1\0"+
|
||||||
"\1\21\2\20\6\21\1\0\2\21\1\20\2\0\5\20\1\0\1\20"+
|
"\1\34\2\0\2\34\1\0\4\34\1\35\2\34\6\35\1\0\2\35"+
|
||||||
"\1\0\6\21\2\0\12\4\2\0\4\20\40\0\1\1\27\0\2\3"+
|
"\1\34\2\0\5\34\1\0\1\34\1\0\6\35\2\0\12\20\2\0"+
|
||||||
"\6\0\12\4\13\0\1\3\1\0\1\3\1\0\1\3\4\0\2\3"+
|
"\4\34\40\0\1\15\27\0\2\5\6\0\12\20\13\0\1\5\1\0"+
|
||||||
"\10\1\1\0\44\1\4\0\24\3\1\0\2\3\5\1\13\3\1\0"+
|
"\1\5\1\0\1\5\4\0\2\5\10\15\1\0\44\15\4\0\24\5"+
|
||||||
"\44\3\11\0\1\3\71\0\53\20\24\21\1\20\12\4\6\0\6\20"+
|
"\1\0\2\5\5\15\13\5\1\0\44\5\11\0\1\5\71\0\53\34"+
|
||||||
"\4\21\4\20\3\21\1\20\3\21\2\20\7\21\3\20\4\21\15\20"+
|
"\24\35\1\34\12\20\6\0\6\34\4\35\4\34\3\35\1\34\3\35"+
|
||||||
"\14\21\1\20\1\21\12\4\4\21\2\20\46\1\1\0\1\1\5\0"+
|
"\2\34\7\35\3\34\4\35\15\34\14\35\1\34\1\35\12\20\4\35"+
|
||||||
"\1\1\2\0\53\1\1\0\4\1\u0100\2\111\1\1\0\4\1\2\0"+
|
"\2\34\46\15\1\0\1\15\5\0\1\15\2\0\53\15\1\0\4\15"+
|
||||||
"\7\1\1\0\1\1\1\0\4\1\2\0\51\1\1\0\4\1\2\0"+
|
"\u0100\17\111\15\1\0\4\15\2\0\7\15\1\0\1\15\1\0\4\15"+
|
||||||
"\41\1\1\0\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0"+
|
"\2\0\51\15\1\0\4\15\2\0\41\15\1\0\4\15\2\0\7\15"+
|
||||||
"\17\1\1\0\71\1\1\0\4\1\2\0\103\1\2\0\3\3\40\0"+
|
"\1\0\1\15\1\0\4\15\2\0\17\15\1\0\71\15\1\0\4\15"+
|
||||||
"\20\1\20\0\125\1\14\0\u026c\1\2\0\21\1\1\0\32\1\5\0"+
|
"\2\0\103\15\2\0\3\5\40\0\20\15\20\0\126\15\2\0\6\15"+
|
||||||
"\113\1\3\0\3\1\17\0\15\1\1\0\4\1\3\3\13\0\22\1"+
|
"\3\0\u026c\15\2\0\21\15\1\0\32\15\5\0\113\15\3\0\13\15"+
|
||||||
"\3\3\13\0\22\1\2\3\14\0\15\1\1\0\3\1\1\0\2\3"+
|
"\7\0\15\15\1\0\4\15\3\5\13\0\22\15\3\5\13\0\22\15"+
|
||||||
"\14\0\64\20\40\21\3\0\1\20\4\0\1\20\1\21\2\0\12\4"+
|
"\2\5\14\0\15\15\1\0\3\15\1\0\2\5\14\0\64\34\40\35"+
|
||||||
"\41\0\4\3\1\0\12\4\6\0\130\1\10\0\51\1\1\3\1\1"+
|
"\3\0\1\34\4\0\1\34\1\35\2\0\12\20\41\0\4\5\1\0"+
|
||||||
"\5\0\106\1\12\0\35\1\3\0\14\3\4\0\14\3\12\0\12\4"+
|
"\12\20\6\0\130\15\10\0\5\15\2\5\42\15\1\5\1\15\5\0"+
|
||||||
"\36\20\2\0\5\20\13\0\54\20\4\0\21\21\7\20\2\21\6\0"+
|
"\106\15\12\0\37\15\1\0\14\5\4\0\14\5\12\0\12\20\36\34"+
|
||||||
"\12\4\1\20\3\0\2\20\40\0\27\1\5\3\4\0\65\20\12\21"+
|
"\2\0\5\34\13\0\54\34\4\0\32\34\6\0\12\20\1\34\3\0"+
|
||||||
"\1\0\35\21\2\0\1\3\12\4\6\0\12\4\6\0\16\20\122\0"+
|
"\2\34\40\0\27\15\5\5\4\0\65\34\12\35\1\0\35\35\2\0"+
|
||||||
"\5\3\57\1\21\3\7\1\4\0\12\4\21\0\11\3\14\0\3\3"+
|
"\1\5\12\20\6\0\12\20\6\0\16\34\2\0\17\5\101\0\5\5"+
|
||||||
"\36\1\15\3\2\1\12\4\54\1\16\3\14\0\44\1\24\3\10\0"+
|
"\57\15\21\5\7\15\4\0\12\20\21\0\11\5\14\0\3\5\36\15"+
|
||||||
"\12\4\3\0\3\1\12\4\44\1\122\0\3\3\1\0\25\3\4\1"+
|
"\15\5\2\15\12\20\54\15\16\5\14\0\44\15\24\5\10\0\12\20"+
|
||||||
"\1\3\4\1\3\3\2\1\11\0\300\1\47\3\25\0\4\3\u0116\1"+
|
"\3\0\3\15\12\20\44\15\2\0\11\15\107\0\3\5\1\0\25\5"+
|
||||||
"\2\0\6\1\2\0\46\1\2\0\6\1\2\0\10\1\1\0\1\1"+
|
"\4\15\1\5\4\15\3\5\2\15\1\0\2\5\6\0\300\15\66\5"+
|
||||||
"\1\0\1\1\1\0\1\1\1\0\37\1\2\0\65\1\1\0\7\1"+
|
"\5\0\5\5\u0116\15\2\0\6\15\2\0\46\15\2\0\6\15\2\0"+
|
||||||
"\1\0\1\1\3\0\3\1\1\0\7\1\3\0\4\1\2\0\6\1"+
|
"\10\15\1\0\1\15\1\0\1\15\1\0\1\15\1\0\37\15\2\0"+
|
||||||
"\4\0\15\1\5\0\3\1\1\0\7\1\17\0\4\3\10\0\2\10"+
|
"\65\15\1\0\7\15\1\0\1\15\3\0\3\15\1\0\7\15\3\0"+
|
||||||
"\12\0\1\10\2\0\1\6\2\0\5\3\20\0\2\11\3\0\1\7"+
|
"\4\15\2\0\6\15\4\0\15\15\5\0\3\15\1\0\7\15\17\0"+
|
||||||
"\17\0\1\11\13\0\5\3\1\0\12\3\1\0\1\1\15\0\1\1"+
|
"\1\5\1\12\2\5\10\0\2\25\12\0\1\25\2\0\1\23\2\0"+
|
||||||
"\20\0\15\1\63\0\41\3\21\0\1\1\4\0\1\1\2\0\12\1"+
|
"\5\5\1\26\14\0\1\4\2\0\2\26\3\0\1\24\4\0\1\4"+
|
||||||
"\1\0\1\1\3\0\5\1\6\0\1\1\1\0\1\1\1\0\1\1"+
|
"\12\0\1\26\13\0\5\5\1\0\12\5\1\0\1\15\15\0\1\15"+
|
||||||
"\1\0\4\1\1\0\13\1\2\0\4\1\5\0\5\1\4\0\1\1"+
|
"\20\0\15\15\63\0\23\5\1\10\15\5\21\0\1\15\4\0\1\15"+
|
||||||
"\21\0\51\1\u032d\0\64\1\u0716\0\57\1\1\0\57\1\1\0\205\1"+
|
"\2\0\12\15\1\0\1\15\3\0\5\15\4\0\1\4\1\0\1\15"+
|
||||||
"\6\0\4\1\3\3\2\1\14\0\46\1\1\0\1\1\5\0\1\1"+
|
"\1\0\1\15\1\0\1\15\1\0\4\15\1\0\12\15\1\16\2\0"+
|
||||||
"\2\0\70\1\7\0\1\1\17\0\1\3\27\1\11\0\7\1\1\0"+
|
"\4\15\5\0\5\15\4\0\1\15\21\0\51\15\13\0\6\4\17\0"+
|
||||||
"\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0"+
|
"\2\4\u016f\0\2\4\14\0\1\4\137\0\1\4\106\0\1\4\31\0"+
|
||||||
"\7\1\1\0\7\1\1\0\40\3\57\0\1\1\120\0\32\12\1\0"+
|
"\13\4\4\0\3\4\273\0\14\15\1\16\47\15\300\0\2\4\12\0"+
|
||||||
"\131\12\14\0\326\12\57\0\1\1\1\0\1\12\31\0\11\12\6\3"+
|
"\1\4\11\0\1\4\72\0\4\4\1\0\5\4\1\4\1\0\7\4"+
|
||||||
"\1\0\5\5\2\0\3\12\1\1\1\1\4\0\126\13\2\0\2\3"+
|
"\1\4\2\4\1\4\1\4\1\0\2\4\2\4\1\4\4\4\1\3"+
|
||||||
"\2\5\3\13\133\5\1\0\4\5\5\0\51\1\3\0\136\2\21\0"+
|
"\2\4\1\4\1\4\2\4\2\4\1\4\3\4\1\4\3\4\2\4"+
|
||||||
"\33\1\65\0\20\5\320\0\57\5\1\0\130\5\250\0\u19b6\12\112\0"+
|
"\10\4\3\4\5\4\1\4\1\4\1\4\5\4\14\4\13\4\2\4"+
|
||||||
"\u51cd\12\63\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\4"+
|
"\2\4\1\4\1\4\2\4\1\4\1\4\22\4\1\4\2\4\2\4"+
|
||||||
"\2\1\24\0\57\1\4\3\1\0\12\3\1\0\31\1\7\0\1\3"+
|
"\6\4\12\0\2\4\6\4\1\4\1\4\1\4\2\4\3\4\2\4"+
|
||||||
"\120\1\2\3\45\0\11\1\2\0\147\1\2\0\4\1\1\0\4\1"+
|
"\10\4\2\4\4\4\2\4\13\4\2\4\5\4\2\4\2\4\1\4"+
|
||||||
"\14\0\13\1\115\0\12\1\1\3\3\1\1\3\4\1\1\3\27\1"+
|
"\5\4\2\4\1\4\1\4\1\4\2\4\24\4\2\4\5\4\6\4"+
|
||||||
"\5\3\30\0\64\1\14\0\2\3\62\1\21\3\13\0\12\4\6\0"+
|
"\1\4\2\4\1\3\1\4\2\4\1\4\4\4\1\4\2\4\1\4"+
|
||||||
"\22\3\6\1\3\0\1\1\4\0\12\4\34\1\10\3\2\0\27\1"+
|
"\2\0\2\4\4\3\1\4\1\4\2\4\1\4\1\0\1\4\1\0"+
|
||||||
"\15\3\14\0\35\2\3\0\4\3\57\1\16\3\16\0\1\1\12\4"+
|
"\1\4\6\0\1\4\3\0\1\4\6\0\1\4\12\0\2\4\17\0"+
|
||||||
"\46\0\51\1\16\3\11\0\3\1\1\3\10\1\2\3\2\0\12\4"+
|
"\1\4\2\0\1\4\4\0\1\4\1\0\1\4\4\0\3\4\1\0"+
|
||||||
"\6\0\33\20\1\21\4\0\60\20\1\21\1\20\3\21\2\20\2\21"+
|
"\1\4\13\0\2\4\3\4\55\0\3\4\11\0\1\4\16\0\1\4"+
|
||||||
"\5\20\2\21\1\20\1\21\1\20\30\0\5\20\13\1\5\3\2\0"+
|
"\16\0\1\4\u0174\0\2\4\u01cf\0\3\4\23\0\2\4\63\0\1\4"+
|
||||||
"\3\1\2\3\12\0\6\1\2\0\6\1\2\0\6\1\11\0\7\1"+
|
"\4\0\1\4\252\0\57\15\1\0\57\15\1\0\205\15\6\0\4\15"+
|
||||||
"\1\0\7\1\221\0\43\1\10\3\1\0\2\3\2\0\12\4\6\0"+
|
"\3\5\2\15\14\0\46\15\1\0\1\15\5\0\1\15\2\0\70\15"+
|
||||||
"\u2ba4\2\14\0\27\2\4\0\61\2\u2104\0\u016e\12\2\0\152\12\46\0"+
|
"\7\0\1\15\17\0\1\5\27\15\11\0\7\15\1\0\7\15\1\0"+
|
||||||
"\7\1\14\0\5\1\5\0\1\16\1\3\12\16\1\0\15\16\1\0"+
|
"\7\15\1\0\7\15\1\0\7\15\1\0\7\15\1\0\7\15\1\0"+
|
||||||
"\5\16\1\0\1\16\1\0\2\16\1\0\2\16\1\0\12\16\142\1"+
|
"\7\15\1\0\40\5\57\0\1\15\120\0\32\27\1\0\131\27\14\0"+
|
||||||
"\41\0\u016b\1\22\0\100\1\2\0\66\1\50\0\14\1\4\0\20\3"+
|
"\326\27\57\0\1\15\1\0\1\27\31\0\11\27\6\5\1\4\5\22"+
|
||||||
"\1\7\2\0\1\6\1\7\13\0\7\3\14\0\2\11\30\0\3\11"+
|
"\2\0\3\27\1\15\1\15\1\4\3\0\126\30\2\0\2\5\2\22"+
|
||||||
"\1\7\1\0\1\10\1\0\1\7\1\6\32\0\5\1\1\0\207\1"+
|
"\3\30\133\22\1\0\4\22\5\0\51\15\3\0\136\17\21\0\33\15"+
|
||||||
"\2\0\1\3\7\0\1\10\4\0\1\7\1\0\1\10\1\0\12\4"+
|
"\65\0\20\22\227\0\1\4\1\0\1\4\66\0\57\22\1\0\130\22"+
|
||||||
"\1\6\1\7\5\0\32\1\4\0\1\11\1\0\32\1\13\0\70\5"+
|
"\250\0\u19b6\27\112\0\u51d6\27\52\0\u048d\15\103\0\56\15\2\0\u010d\15"+
|
||||||
"\2\3\37\2\3\0\6\2\2\0\6\2\2\0\6\2\2\0\3\2"+
|
"\3\0\20\15\12\20\2\15\24\0\57\15\4\5\1\0\12\5\1\0"+
|
||||||
"\34\0\3\3\4\0\14\1\1\0\32\1\1\0\23\1\1\0\2\1"+
|
"\37\15\2\5\120\15\2\5\45\0\11\15\2\0\147\15\2\0\44\15"+
|
||||||
"\1\0\17\1\2\0\16\1\42\0\173\1\105\0\65\1\210\0\1\3"+
|
"\1\0\10\15\77\0\13\15\1\5\3\15\1\5\4\15\1\5\27\15"+
|
||||||
"\202\0\35\1\3\0\61\1\57\0\37\1\21\0\33\1\65\0\36\1"+
|
"\5\5\30\0\64\15\14\0\2\5\62\15\22\5\12\0\12\20\6\0"+
|
||||||
"\2\0\44\1\4\0\10\1\1\0\5\1\52\0\236\1\2\0\12\4"+
|
"\22\5\6\15\3\0\1\15\1\0\1\15\2\0\12\20\34\15\10\5"+
|
||||||
"\u0356\0\6\1\2\0\1\1\1\0\54\1\1\0\2\1\3\0\1\1"+
|
"\2\0\27\15\15\5\14\0\35\17\3\0\4\5\57\15\16\5\16\0"+
|
||||||
"\2\0\27\1\252\0\26\1\12\0\32\1\106\0\70\1\6\0\2\1"+
|
"\1\15\12\20\6\0\5\34\1\35\12\34\12\20\5\34\1\0\51\15"+
|
||||||
"\100\0\1\1\3\3\1\0\2\3\5\0\4\3\4\1\1\0\3\1"+
|
"\16\5\11\0\3\15\1\5\10\15\2\5\2\0\12\20\6\0\33\34"+
|
||||||
"\1\0\33\1\4\0\3\3\4\0\1\3\40\0\35\1\203\0\66\1"+
|
"\3\35\62\34\1\35\1\34\3\35\2\34\2\35\5\34\2\35\1\34"+
|
||||||
"\12\0\26\1\12\0\23\1\215\0\111\1\u03b7\0\3\3\65\1\17\3"+
|
"\1\35\1\34\30\0\5\34\13\15\5\5\2\0\3\15\2\5\12\0"+
|
||||||
"\37\0\12\4\20\0\3\3\55\1\13\3\2\0\1\3\22\0\31\1"+
|
"\6\15\2\0\6\15\2\0\6\15\11\0\7\15\1\0\7\15\1\0"+
|
||||||
"\7\0\12\4\6\0\3\3\44\1\16\3\1\0\12\4\100\0\3\3"+
|
"\53\15\1\0\12\15\12\0\163\15\10\5\1\0\2\5\2\0\12\20"+
|
||||||
"\60\1\16\3\4\1\13\0\12\4\u04a6\0\53\1\15\3\10\0\12\4"+
|
"\6\0\u2ba4\17\14\0\27\17\4\0\61\17\u2104\0\u016e\27\2\0\152\27"+
|
||||||
"\u0936\0\u036f\1\221\0\143\1\u0b9d\0\u042f\1\u33d1\0\u0239\1\u04c7\0\105\1"+
|
"\46\0\7\15\14\0\5\15\5\0\1\33\1\5\12\33\1\0\15\33"+
|
||||||
"\13\0\1\1\56\3\20\0\4\3\15\1\u4060\0\1\5\1\13\u2163\0"+
|
"\1\0\5\33\1\0\1\33\1\0\2\33\1\0\2\33\1\0\12\33"+
|
||||||
"\5\3\3\0\26\3\2\0\7\3\36\0\4\3\224\0\3\3\u01bb\0"+
|
"\142\15\41\0\u016b\15\22\0\100\15\2\0\66\15\50\0\14\15\4\0"+
|
||||||
"\125\1\1\0\107\1\1\0\2\1\2\0\1\1\2\0\2\1\2\0"+
|
"\16\5\1\6\1\11\1\24\2\0\1\23\1\24\13\0\20\5\3\0"+
|
||||||
"\4\1\1\0\14\1\1\0\1\1\1\0\7\1\1\0\101\1\1\0"+
|
"\2\26\30\0\3\26\1\24\1\0\1\25\1\0\1\24\1\23\32\0"+
|
||||||
"\4\1\2\0\10\1\1\0\7\1\1\0\34\1\1\0\4\1\1\0"+
|
"\5\15\1\0\207\15\2\0\1\5\7\0\1\25\4\0\1\24\1\0"+
|
||||||
"\5\1\1\0\1\1\3\0\7\1\1\0\u0154\1\2\0\31\1\1\0"+
|
"\1\25\1\0\12\20\1\23\1\24\5\0\32\15\4\0\1\26\1\0"+
|
||||||
"\31\1\1\0\37\1\1\0\31\1\1\0\37\1\1\0\31\1\1\0"+
|
"\32\15\13\0\70\22\2\5\37\17\3\0\6\17\2\0\6\17\2\0"+
|
||||||
"\37\1\1\0\31\1\1\0\37\1\1\0\31\1\1\0\10\1\2\0"+
|
"\6\17\2\0\3\17\34\0\3\5\4\0\14\15\1\0\32\15\1\0"+
|
||||||
"\62\4\u1600\0\4\1\1\0\33\1\1\0\2\1\1\0\1\1\2\0"+
|
"\23\15\1\0\2\15\1\0\17\15\2\0\16\15\42\0\173\15\105\0"+
|
||||||
"\1\1\1\0\12\1\1\0\4\1\1\0\1\1\1\0\1\1\6\0"+
|
"\65\15\210\0\1\5\202\0\35\15\3\0\61\15\17\0\1\5\37\0"+
|
||||||
"\1\1\4\0\1\1\1\0\1\1\1\0\1\1\1\0\3\1\1\0"+
|
"\40\15\20\0\33\15\5\0\46\15\5\5\5\0\36\15\2\0\44\15"+
|
||||||
"\2\1\1\0\1\1\2\0\1\1\1\0\1\1\1\0\1\1\1\0"+
|
"\4\0\10\15\1\0\5\15\52\0\236\15\2\0\12\20\6\0\44\15"+
|
||||||
"\1\1\1\0\1\1\1\0\2\1\1\0\1\1\2\0\4\1\1\0"+
|
"\4\0\44\15\4\0\50\15\10\0\64\15\234\0\u0137\15\11\0\26\15"+
|
||||||
"\7\1\1\0\4\1\1\0\4\1\1\0\1\1\1\0\12\1\1\0"+
|
"\12\0\10\15\230\0\6\15\2\0\1\15\1\0\54\15\1\0\2\15"+
|
||||||
"\21\1\5\0\3\1\1\0\5\1\1\0\21\1\u032a\0\32\17\1\13"+
|
"\3\0\1\15\2\0\27\15\12\0\27\15\11\0\37\15\101\0\23\15"+
|
||||||
"\u0dff\0\ua6d7\12\51\0\u1035\12\13\0\336\12\u3fe2\0\u021e\12\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\u05ee\0"+
|
"\1\0\2\15\12\0\26\15\12\0\32\15\106\0\70\15\6\0\2\15"+
|
||||||
"\1\3\36\0\140\3\200\0\360\3\uffff\0\uffff\0\ufe12\0";
|
"\100\0\1\15\3\5\1\0\2\5\5\0\4\5\4\15\1\0\3\15"+
|
||||||
|
"\1\0\33\15\4\0\3\5\4\0\1\5\40\0\35\15\3\0\35\15"+
|
||||||
|
"\43\0\10\15\1\0\34\15\2\5\31\0\66\15\12\0\26\15\12\0"+
|
||||||
|
"\23\15\15\0\22\15\156\0\111\15\67\0\63\15\15\0\63\15\u030d\0"+
|
||||||
|
"\3\5\65\15\17\5\37\0\12\20\17\0\4\5\55\15\13\5\2\0"+
|
||||||
|
"\1\5\22\0\31\15\7\0\12\20\6\0\3\5\44\15\16\5\1\0"+
|
||||||
|
"\12\20\20\0\43\15\1\5\2\0\1\15\11\0\3\5\60\15\16\5"+
|
||||||
|
"\4\15\5\0\3\5\3\0\12\20\1\15\1\0\1\15\43\0\22\15"+
|
||||||
|
"\1\0\31\15\14\5\6\0\1\5\101\0\7\15\1\0\1\15\1\0"+
|
||||||
|
"\4\15\1\0\17\15\1\0\12\15\7\0\57\15\14\5\5\0\12\20"+
|
||||||
|
"\6\0\4\5\1\0\10\15\2\0\2\15\2\0\26\15\1\0\7\15"+
|
||||||
|
"\1\0\2\15\1\0\5\15\2\0\1\5\1\15\7\5\2\0\2\5"+
|
||||||
|
"\2\0\3\5\2\0\1\15\6\0\1\5\5\0\5\15\2\5\2\0"+
|
||||||
|
"\7\5\3\0\5\5\213\0\65\15\22\5\4\15\5\0\12\20\46\0"+
|
||||||
|
"\60\15\24\5\2\15\1\0\1\15\10\0\12\20\246\0\57\15\7\5"+
|
||||||
|
"\2\0\11\5\27\0\4\15\2\5\42\0\60\15\21\5\3\0\1\15"+
|
||||||
|
"\13\0\12\20\46\0\53\15\15\5\10\0\12\20\66\0\32\34\3\0"+
|
||||||
|
"\17\35\4\0\12\20\2\34\3\0\1\34\u0160\0\100\15\12\20\25\0"+
|
||||||
|
"\1\15\u01c0\0\71\15\u0107\0\11\15\1\0\45\15\10\5\1\0\10\5"+
|
||||||
|
"\1\15\17\0\12\20\30\0\36\15\2\0\26\5\1\0\16\5\u0349\0"+
|
||||||
|
"\u039a\15\146\0\157\15\21\0\304\15\u0abc\0\u042f\15\u0fd1\0\u0247\15\u21b9\0"+
|
||||||
|
"\u0239\15\7\0\37\15\1\0\12\20\146\0\36\15\2\0\5\5\13\0"+
|
||||||
|
"\60\15\7\5\11\0\4\15\14\0\12\20\11\0\25\15\5\0\23\15"+
|
||||||
|
"\u0370\0\105\15\13\0\1\15\56\5\20\0\4\5\15\15\100\0\1\15"+
|
||||||
|
"\u401f\0\1\22\1\30\u0bfe\0\153\15\5\0\15\15\3\0\11\15\7\0"+
|
||||||
|
"\12\15\3\0\2\5\1\0\4\5\u14c1\0\5\5\3\0\26\5\2\0"+
|
||||||
|
"\7\5\36\0\4\5\224\0\3\5\u01bb\0\125\15\1\0\107\15\1\0"+
|
||||||
|
"\2\15\2\0\1\15\2\0\2\15\2\0\4\15\1\0\14\15\1\0"+
|
||||||
|
"\1\15\1\0\7\15\1\0\101\15\1\0\4\15\2\0\10\15\1\0"+
|
||||||
|
"\7\15\1\0\34\15\1\0\4\15\1\0\5\15\1\0\1\15\3\0"+
|
||||||
|
"\7\15\1\0\u0154\15\2\0\31\15\1\0\31\15\1\0\37\15\1\0"+
|
||||||
|
"\31\15\1\0\37\15\1\0\31\15\1\0\37\15\1\0\31\15\1\0"+
|
||||||
|
"\37\15\1\0\31\15\1\0\10\15\2\0\62\20\u0200\0\67\5\4\0"+
|
||||||
|
"\62\5\10\0\1\5\16\0\1\5\26\0\5\5\1\0\17\5\u0550\0"+
|
||||||
|
"\7\5\1\0\21\5\2\0\7\5\1\0\2\5\1\0\5\5\u07d5\0"+
|
||||||
|
"\305\15\13\0\7\5\51\0\104\15\7\5\5\0\12\20\u04a6\0\4\15"+
|
||||||
|
"\1\0\33\15\1\0\2\15\1\0\1\15\2\0\1\15\1\0\12\15"+
|
||||||
|
"\1\0\4\15\1\0\1\15\1\0\1\15\6\0\1\15\4\0\1\15"+
|
||||||
|
"\1\0\1\15\1\0\1\15\1\0\3\15\1\0\2\15\1\0\1\15"+
|
||||||
|
"\2\0\1\15\1\0\1\15\1\0\1\15\1\0\1\15\1\0\1\15"+
|
||||||
|
"\1\0\2\15\1\0\1\15\2\0\4\15\1\0\7\15\1\0\4\15"+
|
||||||
|
"\1\0\4\15\1\0\1\15\1\0\12\15\1\0\21\15\5\0\3\15"+
|
||||||
|
"\1\0\5\15\1\0\21\15\u0144\0\4\4\1\4\312\4\1\4\60\4"+
|
||||||
|
"\15\0\3\4\37\0\1\4\32\15\6\0\32\15\2\0\4\4\2\16"+
|
||||||
|
"\14\15\2\16\12\15\4\0\1\4\2\0\12\4\22\0\71\4\32\1"+
|
||||||
|
"\1\30\2\4\15\4\12\0\1\4\24\0\1\4\2\0\11\4\1\0"+
|
||||||
|
"\4\4\11\0\7\4\2\4\256\4\42\4\2\4\141\4\1\3\16\4"+
|
||||||
|
"\2\4\2\4\1\4\3\4\2\4\44\4\3\3\2\4\1\3\2\4"+
|
||||||
|
"\3\3\44\4\2\4\3\4\1\4\4\4\5\2\102\4\2\3\2\4"+
|
||||||
|
"\13\3\25\4\4\3\4\4\1\3\1\4\11\3\3\4\1\3\4\4"+
|
||||||
|
"\3\3\1\4\3\3\42\4\1\3\123\4\1\4\77\4\10\0\3\4"+
|
||||||
|
"\6\4\1\4\30\4\7\4\2\4\2\4\1\4\2\3\4\4\1\3"+
|
||||||
|
"\14\4\1\4\2\4\4\4\2\4\1\3\4\4\2\3\15\4\2\4"+
|
||||||
|
"\2\4\1\4\10\4\2\4\11\4\1\4\5\4\3\4\14\4\3\4"+
|
||||||
|
"\10\4\3\4\2\4\1\4\1\4\1\4\4\4\1\4\6\4\1\4"+
|
||||||
|
"\3\4\1\4\6\4\113\4\3\3\3\4\5\3\60\0\43\4\1\3"+
|
||||||
|
"\20\4\3\3\11\4\1\3\5\4\5\4\1\4\1\3\6\4\15\4"+
|
||||||
|
"\6\4\3\4\1\4\1\4\2\4\3\4\1\4\2\4\7\4\6\4"+
|
||||||
|
"\164\0\14\4\125\0\53\4\14\0\4\4\70\0\10\4\12\0\6\4"+
|
||||||
|
"\50\0\10\4\36\0\122\4\14\0\4\4\10\4\5\3\1\4\2\3"+
|
||||||
|
"\6\4\1\3\11\4\12\3\1\4\1\0\1\4\2\3\1\4\6\4"+
|
||||||
|
"\1\0\52\4\2\4\4\4\3\4\1\4\1\4\47\4\15\4\5\4"+
|
||||||
|
"\2\3\1\4\2\3\6\4\3\4\15\4\1\4\15\3\42\4\u05fe\4"+
|
||||||
|
"\2\0\ua6d7\27\51\0\u1035\27\13\0\336\27\2\0\u1682\27\u295e\0\u021e\27"+
|
||||||
|
"\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\u05ee\0"+
|
||||||
|
"\1\5\36\0\137\13\1\14\200\0\360\5\uffff\0\uffff\0\ufe12\0";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Translates characters to character classes
|
* Translates characters to character classes
|
||||||
|
@ -218,12 +284,15 @@ public final class StandardTokenizerImpl {
|
||||||
private static final int [] ZZ_ACTION = zzUnpackAction();
|
private static final int [] ZZ_ACTION = zzUnpackAction();
|
||||||
|
|
||||||
private static final String ZZ_ACTION_PACKED_0 =
|
private static final String ZZ_ACTION_PACKED_0 =
|
||||||
"\1\0\1\1\1\2\1\3\1\4\1\5\1\1\1\6"+
|
"\1\0\2\1\3\2\2\1\1\3\1\2\1\4\2\5"+
|
||||||
"\1\7\1\2\1\1\1\10\1\2\1\0\1\2\1\0"+
|
"\1\6\1\1\1\7\1\10\1\3\1\11\1\2\1\0"+
|
||||||
"\1\4\1\0\2\2\2\0\1\1\1\0";
|
"\4\2\1\0\1\2\2\0\1\3\1\0\1\3\2\2"+
|
||||||
|
"\1\0\1\5\1\2\1\5\1\0\2\3\1\0\2\2"+
|
||||||
|
"\2\0\1\2\1\0\2\3\5\2\1\0\1\2\1\3"+
|
||||||
|
"\3\2";
|
||||||
|
|
||||||
private static int [] zzUnpackAction() {
|
private static int [] zzUnpackAction() {
|
||||||
int [] result = new int[24];
|
int [] result = new int[61];
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
|
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
|
||||||
return result;
|
return result;
|
||||||
|
@ -248,12 +317,17 @@ public final class StandardTokenizerImpl {
|
||||||
private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
|
private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
|
||||||
|
|
||||||
private static final String ZZ_ROWMAP_PACKED_0 =
|
private static final String ZZ_ROWMAP_PACKED_0 =
|
||||||
"\0\0\0\22\0\44\0\66\0\110\0\132\0\154\0\176"+
|
"\0\0\0\36\0\74\0\132\0\170\0\226\0\264\0\322"+
|
||||||
"\0\220\0\242\0\264\0\306\0\330\0\352\0\374\0\u010e"+
|
"\0\360\0\u010e\0\u012c\0\u014a\0\u0168\0\u0186\0\u01a4\0\u01c2"+
|
||||||
"\0\u0120\0\154\0\u0132\0\u0144\0\u0156\0\264\0\u0168\0\u017a";
|
"\0\u01e0\0\u01fe\0\u021c\0\u023a\0\74\0\u0258\0\u0276\0\u0294"+
|
||||||
|
"\0\u02b2\0\264\0\u02d0\0\u02ee\0\322\0\u030c\0\u032a\0\u0348"+
|
||||||
|
"\0\u0366\0\u0384\0\u03a2\0\u03c0\0\u03de\0\u03fc\0\u01a4\0\u041a"+
|
||||||
|
"\0\u0438\0\u0456\0\u0474\0\u0492\0\u04b0\0\u04ce\0\u04ec\0\u050a"+
|
||||||
|
"\0\u0528\0\u0546\0\u0564\0\u0582\0\u05a0\0\u05be\0\u05dc\0\u05fa"+
|
||||||
|
"\0\36\0\u0618\0\360\0\u0636\0\u0654";
|
||||||
|
|
||||||
private static int [] zzUnpackRowMap() {
|
private static int [] zzUnpackRowMap() {
|
||||||
int [] result = new int[24];
|
int [] result = new int[61];
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
|
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
|
||||||
return result;
|
return result;
|
||||||
|
@ -276,33 +350,94 @@ public final class StandardTokenizerImpl {
|
||||||
private static final int [] ZZ_TRANS = zzUnpackTrans();
|
private static final int [] ZZ_TRANS = zzUnpackTrans();
|
||||||
|
|
||||||
private static final String ZZ_TRANS_PACKED_0 =
|
private static final String ZZ_TRANS_PACKED_0 =
|
||||||
"\1\2\1\3\1\4\1\2\1\5\1\6\3\2\1\7"+
|
"\1\2\1\3\1\4\1\5\1\6\2\2\1\7\2\2"+
|
||||||
"\1\10\1\11\2\2\1\12\1\13\2\14\23\0\3\3"+
|
"\1\10\2\2\1\11\1\12\1\13\1\14\1\15\1\16"+
|
||||||
"\1\15\1\0\1\16\1\0\1\16\1\17\2\0\1\16"+
|
"\3\2\1\17\1\20\1\21\2\2\1\22\2\23\37\0"+
|
||||||
"\1\0\1\12\2\0\1\3\1\0\1\3\2\4\1\15"+
|
"\1\24\3\0\2\25\1\0\5\25\20\0\1\25\5\0"+
|
||||||
"\1\0\1\16\1\0\1\16\1\17\2\0\1\16\1\0"+
|
"\1\4\2\0\1\4\1\0\1\26\2\4\20\0\1\4"+
|
||||||
"\1\12\2\0\1\4\1\0\2\3\2\5\2\0\2\20"+
|
"\2\0\1\4\2\0\1\5\2\0\1\5\1\27\1\30"+
|
||||||
"\1\21\2\0\1\20\1\0\1\12\2\0\1\5\3\0"+
|
"\2\5\20\0\1\5\5\0\1\6\2\0\1\6\1\27"+
|
||||||
"\1\6\1\0\1\6\3\0\1\17\7\0\1\6\1\0"+
|
"\1\31\2\6\20\0\1\6\5\0\1\32\2\0\1\33"+
|
||||||
"\2\3\1\22\1\5\1\23\3\0\1\22\4\0\1\12"+
|
"\1\34\3\32\20\0\1\32\3\0\1\5\1\6\5\0"+
|
||||||
"\2\0\1\22\3\0\1\10\15\0\1\10\3\0\1\11"+
|
"\1\35\3\0\1\6\24\0\2\11\1\0\10\11\2\36"+
|
||||||
"\15\0\1\11\1\0\2\3\1\12\1\15\1\0\1\16"+
|
"\1\0\1\37\1\0\1\37\1\40\2\0\1\37\1\0"+
|
||||||
"\1\0\1\16\1\17\2\0\1\24\1\25\1\12\2\0"+
|
"\1\22\1\0\1\11\5\0\1\12\1\11\1\0\1\12"+
|
||||||
"\1\12\3\0\1\26\13\0\1\27\1\0\1\26\3\0"+
|
"\1\41\1\42\2\12\3\11\2\36\1\0\1\37\1\0"+
|
||||||
"\1\14\14\0\2\14\1\0\2\3\2\15\2\0\2\30"+
|
"\1\37\1\40\2\0\1\37\1\0\1\22\1\0\1\12"+
|
||||||
"\1\17\2\0\1\30\1\0\1\12\2\0\1\15\1\0"+
|
"\5\0\2\13\1\0\5\13\2\11\1\13\2\36\1\0"+
|
||||||
"\2\3\1\16\12\0\1\3\2\0\1\16\1\0\2\3"+
|
"\1\37\1\0\1\37\1\40\2\0\1\37\1\0\1\22"+
|
||||||
"\1\17\1\15\1\23\3\0\1\17\4\0\1\12\2\0"+
|
"\1\0\1\13\5\0\2\14\1\0\5\14\3\11\2\14"+
|
||||||
"\1\17\3\0\1\20\1\5\14\0\1\20\1\0\2\3"+
|
"\2\0\2\43\1\44\2\0\1\43\1\0\1\22\1\0"+
|
||||||
"\1\21\1\5\1\23\3\0\1\21\4\0\1\12\2\0"+
|
"\1\14\5\0\1\15\1\14\1\0\1\45\1\46\3\15"+
|
||||||
"\1\21\3\0\1\23\1\0\1\23\3\0\1\17\7\0"+
|
"\3\11\2\14\2\0\2\43\1\44\2\0\1\43\1\0"+
|
||||||
"\1\23\1\0\2\3\1\24\1\15\4\0\1\17\4\0"+
|
"\1\22\1\0\1\15\5\0\2\16\1\0\5\16\5\0"+
|
||||||
"\1\12\2\0\1\24\3\0\1\25\12\0\1\24\2\0"+
|
"\1\16\3\0\1\40\6\0\1\16\5\0\2\47\1\0"+
|
||||||
"\1\25\3\0\1\27\13\0\1\27\1\0\1\27\3\0"+
|
"\5\47\3\11\2\14\1\50\3\0\1\47\4\0\1\22"+
|
||||||
"\1\30\1\15\14\0\1\30";
|
"\1\0\1\47\5\0\2\20\1\0\5\20\20\0\1\20"+
|
||||||
|
"\5\0\2\21\1\0\5\21\20\0\1\21\5\0\2\22"+
|
||||||
|
"\1\0\5\22\3\11\2\36\1\0\1\37\1\0\1\37"+
|
||||||
|
"\1\40\2\0\1\51\1\52\1\22\1\0\1\22\5\0"+
|
||||||
|
"\2\23\1\0\5\23\17\0\2\23\5\0\2\24\1\0"+
|
||||||
|
"\5\24\20\0\1\24\2\0\1\4\1\53\1\54\1\4"+
|
||||||
|
"\2\0\1\4\1\0\1\26\2\4\1\0\1\54\16\0"+
|
||||||
|
"\1\4\12\0\1\55\1\56\24\0\1\4\1\53\1\54"+
|
||||||
|
"\1\5\2\0\1\5\1\27\1\30\2\5\1\0\1\54"+
|
||||||
|
"\16\0\1\5\2\0\1\4\1\53\1\54\1\6\2\0"+
|
||||||
|
"\1\6\1\27\1\31\2\6\1\0\1\54\16\0\1\6"+
|
||||||
|
"\5\0\1\33\2\0\1\33\1\34\3\33\20\0\1\33"+
|
||||||
|
"\10\0\1\57\32\0\2\36\1\0\5\36\3\11\2\36"+
|
||||||
|
"\2\0\2\60\1\40\2\0\1\60\1\0\1\22\1\0"+
|
||||||
|
"\1\36\5\0\2\37\1\0\5\37\3\11\13\0\1\11"+
|
||||||
|
"\1\0\1\37\5\0\2\40\1\0\5\40\3\11\2\36"+
|
||||||
|
"\1\50\3\0\1\40\4\0\1\22\1\0\1\40\5\0"+
|
||||||
|
"\2\11\1\0\2\11\1\61\1\62\4\11\2\36\1\0"+
|
||||||
|
"\1\37\1\0\1\37\1\40\2\0\1\37\1\0\1\22"+
|
||||||
|
"\1\0\1\11\2\0\1\4\1\53\1\54\1\12\1\11"+
|
||||||
|
"\1\0\1\12\1\41\1\42\2\12\1\11\1\63\1\11"+
|
||||||
|
"\2\36\1\0\1\37\1\0\1\37\1\40\2\0\1\37"+
|
||||||
|
"\1\0\1\22\1\0\1\12\5\0\2\43\1\0\5\43"+
|
||||||
|
"\3\0\2\14\13\0\1\43\5\0\2\44\1\0\5\44"+
|
||||||
|
"\3\11\2\14\1\50\3\0\1\44\4\0\1\22\1\0"+
|
||||||
|
"\1\44\5\0\1\45\1\14\1\0\1\45\1\46\3\45"+
|
||||||
|
"\3\11\2\14\2\0\2\43\1\44\2\0\1\43\1\0"+
|
||||||
|
"\1\22\1\0\1\45\5\0\2\14\1\0\1\64\4\14"+
|
||||||
|
"\3\11\2\14\2\0\2\43\1\44\2\0\1\43\1\0"+
|
||||||
|
"\1\22\1\0\1\14\5\0\2\50\1\0\5\50\5\0"+
|
||||||
|
"\1\50\3\0\1\40\6\0\1\50\5\0\2\51\1\0"+
|
||||||
|
"\5\51\3\11\2\36\4\0\1\40\4\0\1\22\1\0"+
|
||||||
|
"\1\51\5\0\2\52\1\0\5\52\16\0\1\51\1\0"+
|
||||||
|
"\1\52\2\0\1\4\2\0\1\53\2\0\1\53\1\65"+
|
||||||
|
"\1\66\2\53\20\0\1\53\5\0\1\54\2\0\1\54"+
|
||||||
|
"\1\65\1\67\2\54\20\0\1\54\2\0\1\4\1\53"+
|
||||||
|
"\1\54\5\0\1\70\3\0\1\54\32\0\1\56\1\71"+
|
||||||
|
"\26\0\1\57\2\0\1\57\1\0\3\57\20\0\1\57"+
|
||||||
|
"\5\0\2\60\1\0\5\60\3\0\2\36\13\0\1\60"+
|
||||||
|
"\2\0\1\4\1\53\1\54\2\11\1\0\2\11\1\72"+
|
||||||
|
"\3\11\1\63\1\11\2\36\1\0\1\37\1\0\1\37"+
|
||||||
|
"\1\40\2\0\1\37\1\0\1\22\1\0\1\11\5\0"+
|
||||||
|
"\2\11\1\0\3\11\1\62\1\73\3\11\2\36\1\0"+
|
||||||
|
"\1\37\1\0\1\37\1\40\2\0\1\37\1\0\1\22"+
|
||||||
|
"\1\0\1\11\5\0\1\63\1\11\1\0\1\63\1\74"+
|
||||||
|
"\1\75\2\63\3\11\2\36\1\0\1\37\1\0\1\37"+
|
||||||
|
"\1\40\2\0\1\37\1\0\1\22\1\0\1\63\5\0"+
|
||||||
|
"\1\64\1\14\1\0\1\64\1\14\3\64\3\11\2\14"+
|
||||||
|
"\2\0\2\43\1\44\2\0\1\43\1\0\1\22\1\0"+
|
||||||
|
"\1\64\12\0\1\55\25\0\1\4\1\53\1\54\1\53"+
|
||||||
|
"\2\0\1\53\1\65\1\66\2\53\1\0\1\54\16\0"+
|
||||||
|
"\1\53\2\0\1\4\1\53\2\54\2\0\1\54\1\65"+
|
||||||
|
"\1\67\2\54\1\0\1\54\16\0\1\54\3\0\1\53"+
|
||||||
|
"\1\54\5\0\1\70\3\0\1\54\22\0\1\53\1\54"+
|
||||||
|
"\2\11\1\0\2\11\1\72\3\11\1\63\1\11\2\36"+
|
||||||
|
"\1\0\1\37\1\0\1\37\1\40\2\0\1\37\1\0"+
|
||||||
|
"\1\22\1\0\1\11\5\0\2\11\1\0\2\11\1\61"+
|
||||||
|
"\5\11\2\36\1\0\1\37\1\0\1\37\1\40\2\0"+
|
||||||
|
"\1\37\1\0\1\22\1\0\1\11\2\0\1\4\1\53"+
|
||||||
|
"\1\54\1\63\1\11\1\0\1\63\1\74\1\75\2\63"+
|
||||||
|
"\1\11\1\63\1\11\2\36\1\0\1\37\1\0\1\37"+
|
||||||
|
"\1\40\2\0\1\37\1\0\1\22\1\0\1\63";
|
||||||
|
|
||||||
private static int [] zzUnpackTrans() {
|
private static int [] zzUnpackTrans() {
|
||||||
int [] result = new int[396];
|
int [] result = new int[1650];
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
|
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
|
||||||
return result;
|
return result;
|
||||||
|
@ -329,7 +464,7 @@ public final class StandardTokenizerImpl {
|
||||||
|
|
||||||
/* error messages for the codes above */
|
/* error messages for the codes above */
|
||||||
private static final String ZZ_ERROR_MSG[] = {
|
private static final String ZZ_ERROR_MSG[] = {
|
||||||
"Unkown internal scanner error",
|
"Unknown internal scanner error",
|
||||||
"Error: could not match input",
|
"Error: could not match input",
|
||||||
"Error: pushback value was too large"
|
"Error: pushback value was too large"
|
||||||
};
|
};
|
||||||
|
@ -340,11 +475,12 @@ public final class StandardTokenizerImpl {
|
||||||
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
|
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
|
||||||
|
|
||||||
private static final String ZZ_ATTRIBUTE_PACKED_0 =
|
private static final String ZZ_ATTRIBUTE_PACKED_0 =
|
||||||
"\1\0\1\11\13\1\1\0\1\1\1\0\1\1\1\0"+
|
"\1\0\1\11\22\1\1\0\4\1\1\0\1\1\2\0"+
|
||||||
"\2\1\2\0\1\1\1\0";
|
"\1\1\1\0\3\1\1\0\3\1\1\0\2\1\1\0"+
|
||||||
|
"\2\1\2\0\1\1\1\0\7\1\1\0\1\11\4\1";
|
||||||
|
|
||||||
private static int [] zzUnpackAttribute() {
|
private static int [] zzUnpackAttribute() {
|
||||||
int [] result = new int[24];
|
int [] result = new int[61];
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
|
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
|
||||||
return result;
|
return result;
|
||||||
|
@ -401,11 +537,11 @@ public final class StandardTokenizerImpl {
|
||||||
private int yycolumn;
|
private int yycolumn;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* zzAtBOL == true <=> the scanner is currently at the beginning of a line
|
* zzAtBOL == true iff the scanner is currently at the beginning of a line
|
||||||
*/
|
*/
|
||||||
private boolean zzAtBOL = true;
|
private boolean zzAtBOL = true;
|
||||||
|
|
||||||
/** zzAtEOF == true <=> the scanner is at the EOF */
|
/** zzAtEOF == true iff the scanner is at the EOF */
|
||||||
private boolean zzAtEOF;
|
private boolean zzAtEOF;
|
||||||
|
|
||||||
/** denotes if the user-EOF-code has already been executed */
|
/** denotes if the user-EOF-code has already been executed */
|
||||||
|
@ -447,6 +583,9 @@ public final class StandardTokenizerImpl {
|
||||||
|
|
||||||
/** Hangul token type */
|
/** Hangul token type */
|
||||||
public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
|
public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
|
||||||
|
|
||||||
|
/** Emoji token type */
|
||||||
|
public static final int EMOJI_TYPE = StandardTokenizer.EMOJI;
|
||||||
|
|
||||||
/** Character count processed so far */
|
/** Character count processed so far */
|
||||||
public final int yychar()
|
public final int yychar()
|
||||||
|
@ -492,7 +631,7 @@ public final class StandardTokenizerImpl {
|
||||||
char [] map = new char[0x110000];
|
char [] map = new char[0x110000];
|
||||||
int i = 0; /* index in packed string */
|
int i = 0; /* index in packed string */
|
||||||
int j = 0; /* index in unpacked array */
|
int j = 0; /* index in unpacked array */
|
||||||
while (i < 2836) {
|
while (i < 4122) {
|
||||||
int count = packed.charAt(i++);
|
int count = packed.charAt(i++);
|
||||||
char value = packed.charAt(i++);
|
char value = packed.charAt(i++);
|
||||||
do map[j++] = value; while (--count > 0);
|
do map[j++] = value; while (--count > 0);
|
||||||
|
@ -500,6 +639,8 @@ public final class StandardTokenizerImpl {
|
||||||
return map;
|
return map;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* -------------------------------------------------------------------------------- */
|
||||||
|
/* Begin Lucene-specific disable-buffer-expansion modifications to skeleton.default */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Refills the input buffer.
|
* Refills the input buffer.
|
||||||
|
@ -527,32 +668,45 @@ public final class StandardTokenizerImpl {
|
||||||
|
|
||||||
|
|
||||||
/* fill the buffer with new input */
|
/* fill the buffer with new input */
|
||||||
int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;
|
int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;
|
||||||
int totalRead = 0;
|
if (requested == 0) {
|
||||||
while (totalRead < requested) {
|
return true;
|
||||||
int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
|
|
||||||
if (numRead == -1) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
totalRead += numRead;
|
|
||||||
}
|
}
|
||||||
|
int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
|
||||||
|
|
||||||
if (totalRead > 0) {
|
/* not supposed to occur according to specification of java.io.Reader */
|
||||||
zzEndRead += totalRead;
|
if (numRead == 0) {
|
||||||
if (totalRead == requested) { /* possibly more input available */
|
throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
|
||||||
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
|
}
|
||||||
|
if (numRead > 0) {
|
||||||
|
zzEndRead += numRead;
|
||||||
|
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
|
||||||
|
if (numRead == requested) { // We might have requested too few chars to encode a full Unicode character.
|
||||||
--zzEndRead;
|
--zzEndRead;
|
||||||
zzFinalHighSurrogate = 1;
|
zzFinalHighSurrogate = 1;
|
||||||
if (totalRead == 1) { return true; }
|
if (numRead == 1) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
} else { // There is room in the buffer for at least one more char
|
||||||
|
int c = zzReader.read(); // Expecting to read a low surrogate char
|
||||||
|
if (c == -1) {
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
zzBuffer[zzEndRead++] = (char)c;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/* potentially more input available */
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// totalRead = 0: End of stream
|
/* numRead < 0 ==> end of stream */
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* End Lucene-specific disable-buffer-expansion modifications to skeleton.default */
|
||||||
|
/* ------------------------------------------------------------------------------ */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Closes the input stream.
|
* Closes the input stream.
|
||||||
|
@ -773,49 +927,62 @@ public final class StandardTokenizerImpl {
|
||||||
// store back cached position
|
// store back cached position
|
||||||
zzMarkedPos = zzMarkedPosL;
|
zzMarkedPos = zzMarkedPosL;
|
||||||
|
|
||||||
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
|
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
|
||||||
case 1:
|
zzAtEOF = true;
|
||||||
{ /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
|
|
||||||
}
|
|
||||||
case 9: break;
|
|
||||||
case 2:
|
|
||||||
{ return WORD_TYPE;
|
|
||||||
}
|
|
||||||
case 10: break;
|
|
||||||
case 3:
|
|
||||||
{ return HANGUL_TYPE;
|
|
||||||
}
|
|
||||||
case 11: break;
|
|
||||||
case 4:
|
|
||||||
{ return NUMERIC_TYPE;
|
|
||||||
}
|
|
||||||
case 12: break;
|
|
||||||
case 5:
|
|
||||||
{ return KATAKANA_TYPE;
|
|
||||||
}
|
|
||||||
case 13: break;
|
|
||||||
case 6:
|
|
||||||
{ return IDEOGRAPHIC_TYPE;
|
|
||||||
}
|
|
||||||
case 14: break;
|
|
||||||
case 7:
|
|
||||||
{ return HIRAGANA_TYPE;
|
|
||||||
}
|
|
||||||
case 15: break;
|
|
||||||
case 8:
|
|
||||||
{ return SOUTH_EAST_ASIAN_TYPE;
|
|
||||||
}
|
|
||||||
case 16: break;
|
|
||||||
default:
|
|
||||||
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
|
|
||||||
zzAtEOF = true;
|
|
||||||
{
|
{
|
||||||
return YYEOF;
|
return YYEOF;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
|
||||||
|
case 1:
|
||||||
|
{ /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, emoji or SE Asian -- ignore it. */
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
case 10: break;
|
||||||
|
case 2:
|
||||||
|
{ return EMOJI_TYPE;
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
case 11: break;
|
||||||
|
case 3:
|
||||||
|
{ return WORD_TYPE;
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
case 12: break;
|
||||||
|
case 4:
|
||||||
|
{ return HANGUL_TYPE;
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
case 13: break;
|
||||||
|
case 5:
|
||||||
|
{ return NUMERIC_TYPE;
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
case 14: break;
|
||||||
|
case 6:
|
||||||
|
{ return KATAKANA_TYPE;
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
case 15: break;
|
||||||
|
case 7:
|
||||||
|
{ return IDEOGRAPHIC_TYPE;
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
case 16: break;
|
||||||
|
case 8:
|
||||||
|
{ return HIRAGANA_TYPE;
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
case 17: break;
|
||||||
|
case 9:
|
||||||
|
{ return SOUTH_EAST_ASIAN_TYPE;
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
case 18: break;
|
||||||
|
default:
|
||||||
zzScanError(ZZ_NO_MATCH);
|
zzScanError(ZZ_NO_MATCH);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -34,12 +34,13 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
* <li><HIRAGANA>: A single hiragana character</li>
|
* <li><HIRAGANA>: A single hiragana character</li>
|
||||||
* <li><KATAKANA>: A sequence of katakana characters</li>
|
* <li><KATAKANA>: A sequence of katakana characters</li>
|
||||||
* <li><HANGUL>: A sequence of Hangul characters</li>
|
* <li><HANGUL>: A sequence of Hangul characters</li>
|
||||||
|
* <li><EMOJI>: A sequence of Emoji characters</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings("fallthrough")
|
@SuppressWarnings("fallthrough")
|
||||||
%%
|
%%
|
||||||
|
|
||||||
%unicode 6.3
|
%unicode 9.0
|
||||||
%integer
|
%integer
|
||||||
%final
|
%final
|
||||||
%public
|
%public
|
||||||
|
@ -48,22 +49,67 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
%char
|
%char
|
||||||
%buffer 255
|
%buffer 255
|
||||||
|
|
||||||
// UAX#29 WB4. X (Extend | Format)* --> X
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
// Begin Emoji Macros - see documentation below, near the EMOJI_TYPE rule
|
||||||
|
|
||||||
|
// TODO: Remove this include file when JFlex supports these properties directly (in Unicode 11.0+)
|
||||||
|
%include ../../../../../../data/jflex/UnicodeEmojiProperties.jflex
|
||||||
|
|
||||||
|
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
|
||||||
//
|
//
|
||||||
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] [\p{WB:Format}\p{WB:Extend}]*
|
// \uFE0E (Text Presentation Selector) and \uFE0F (Emoji Presentation Selector) - included in \p{WB:Extend}
|
||||||
HebrewOrALetterEx = [\p{WB:HebrewLetter}\p{WB:ALetter}] [\p{WB:Format}\p{WB:Extend}]*
|
// - are explicitly excluded here so that we can properly handle Emoji sequences.
|
||||||
NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] [\p{WB:Format}\p{WB:Extend}]*
|
//
|
||||||
KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
|
ExtFmtZwjSansPresSel = [[\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]--[\uFE0E\uFE0F]]*
|
||||||
MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
|
|
||||||
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
|
KeyCapBaseChar = [0-9#*]
|
||||||
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
|
KeyCapBaseCharEx = {KeyCapBaseChar} {ExtFmtZwjSansPresSel}
|
||||||
HanEx = \p{Script:Han} [\p{WB:Format}\p{WB:Extend}]*
|
KeyCap = \u20E3
|
||||||
HiraganaEx = \p{Script:Hiragana} [\p{WB:Format}\p{WB:Extend}]*
|
KeyCapEx = {KeyCap} {ExtFmtZwjSansPresSel}
|
||||||
SingleQuoteEx = \p{WB:Single_Quote} [\p{WB:Format}\p{WB:Extend}]*
|
|
||||||
DoubleQuoteEx = \p{WB:Double_Quote} [\p{WB:Format}\p{WB:Extend}]*
|
// # \u3030 = WAVY DASH; \u303D = PART ALTERNATION MARK
|
||||||
HebrewLetterEx = \p{WB:Hebrew_Letter} [\p{WB:Format}\p{WB:Extend}]*
|
AccidentalEmoji = [©®™\u3030\u303D]
|
||||||
RegionalIndicatorEx = \p{WB:RegionalIndicator} [\p{WB:Format}\p{WB:Extend}]*
|
EmojiRKAM = ( \p{WB:Regional_Indicator} | {KeyCapBaseChar} | {AccidentalEmoji} | {Emoji_Modifier} )
|
||||||
ComplexContextEx = \p{LB:Complex_Context} [\p{WB:Format}\p{WB:Extend}]*
|
|
||||||
|
// Unlike Unicode properties, macros are not allowed in character classes, so we achieve set difference
|
||||||
|
// by applying De Morgan's law: the expression that matches everything in 'a' that is not matched by 'b' is: !(!a|b)
|
||||||
|
// TODO: Convert this expression to character class difference when JFlex supports the properties directly (in Unicode 11.0+)
|
||||||
|
EmojiSansRKAM = !( ! {Emoji} | {EmojiRKAM} )
|
||||||
|
|
||||||
|
EmojiChar = ( {Extended_Pictographic} | {EmojiSansRKAM} )
|
||||||
|
|
||||||
|
EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel}
|
||||||
|
EmojiModifierBaseEx = {Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
|
||||||
|
EmojiModifierEx = {Emoji_Modifier} {ExtFmtZwjSansPresSel}
|
||||||
|
|
||||||
|
EmojiPresentationSelector = \uFE0F
|
||||||
|
EmojiCharOrPresSeqOrModSeq = ( \p{WB:ZWJ}* {EmojiCharEx} {EmojiPresentationSelector}? ) | ( ( \p{WB:ZWJ}* {EmojiModifierBaseEx} )? {EmojiModifierEx} )
|
||||||
|
TagSpec = [\u{E0020}-\u{E007E}]
|
||||||
|
TagTerm = \u{E007F}
|
||||||
|
|
||||||
|
// End Emoji Macros
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
|
||||||
|
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
|
||||||
|
//
|
||||||
|
ExtFmtZwj = [\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]*
|
||||||
|
|
||||||
|
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] {ExtFmtZwj}
|
||||||
|
AHLetterEx = [\p{WB:ALetter}\p{WB:Hebrew_Letter}] {ExtFmtZwj}
|
||||||
|
NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] {ExtFmtZwj}
|
||||||
|
KatakanaEx = \p{WB:Katakana} {ExtFmtZwj}
|
||||||
|
MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] {ExtFmtZwj}
|
||||||
|
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] {ExtFmtZwj}
|
||||||
|
ExtendNumLetEx = \p{WB:ExtendNumLet} {ExtFmtZwj}
|
||||||
|
HanEx = \p{Script:Han} {ExtFmtZwj}
|
||||||
|
HiraganaEx = \p{Script:Hiragana} {ExtFmtZwj}
|
||||||
|
SingleQuoteEx = \p{WB:Single_Quote} {ExtFmtZwj}
|
||||||
|
DoubleQuoteEx = \p{WB:Double_Quote} {ExtFmtZwj}
|
||||||
|
HebrewLetterEx = \p{WB:Hebrew_Letter} {ExtFmtZwj}
|
||||||
|
RegionalIndicatorEx = \p{WB:Regional_Indicator} {ExtFmtZwj}
|
||||||
|
ComplexContextEx = \p{LB:Complex_Context} {ExtFmtZwj}
|
||||||
|
|
||||||
%{
|
%{
|
||||||
/** Alphanumeric sequences */
|
/** Alphanumeric sequences */
|
||||||
|
@ -93,6 +139,9 @@ ComplexContextEx = \p{LB:Complex_Context}
|
||||||
|
|
||||||
/** Hangul token type */
|
/** Hangul token type */
|
||||||
public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
|
public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
|
||||||
|
|
||||||
|
/** Emoji token type */
|
||||||
|
public static final int EMOJI_TYPE = StandardTokenizer.EMOJI;
|
||||||
|
|
||||||
/** Character count processed so far */
|
/** Character count processed so far */
|
||||||
public final int yychar()
|
public final int yychar()
|
||||||
|
@ -120,18 +169,64 @@ ComplexContextEx = \p{LB:Complex_Context}
|
||||||
|
|
||||||
%%
|
%%
|
||||||
|
|
||||||
// UAX#29 WB1. sot ÷
|
// UAX#29 WB1. sot ÷ Any
|
||||||
// WB2. ÷ eot
|
// WB2. Any ÷ eot
|
||||||
//
|
//
|
||||||
<<EOF>> { return YYEOF; }
|
<<EOF>> { return YYEOF; }
|
||||||
|
|
||||||
// UAX#29 WB8. Numeric × Numeric
|
// Instead of these: UAX#29 WB3c. ZWJ × (Glue_After_Zwj | EBG)
|
||||||
// WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
|
// WB14. (E_Base | EBG) × E_Modifier
|
||||||
// WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
|
// WB15. ^ (RI RI)* RI × RI
|
||||||
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
// WB16. [^RI] (RI RI)* RI × RI
|
||||||
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
|
|
||||||
//
|
//
|
||||||
{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
|
// We use the "emoji_sequence" rule from http://www.unicode.org/reports/tr51/tr51-14.html (Unicode 11.0)
|
||||||
|
// and the Emoji data from http://unicode.org/Public/emoji/11.0/emoji-data.txt (in included file UnicodeEmojiProperties.jflex)
|
||||||
|
//
|
||||||
|
// emoji_sequence :=
|
||||||
|
// Top-level EBNF Expanded #1 Expanded #2 Expanded #3
|
||||||
|
// --------------------- ---------------------------- ----------------------------- ----------------------------------------------
|
||||||
|
// emoji_core_sequence emoji_combining_sequence emoji_character ( \p{Emoji}
|
||||||
|
// | emoji_presentation_sequence | \p{Emoji} \uFE0F
|
||||||
|
// | emoji_keycap_sequence | [0-9#*] \u{FE0F 20E3} [1]
|
||||||
|
// | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier}
|
||||||
|
// | emoji_flag_sequence | \p{WB:Regional_Indicator}{2} )
|
||||||
|
//
|
||||||
|
// | emoji_zwj_sequence emoji_zwj_element emoji_character ( \p{Emoji}
|
||||||
|
// | emoji_presentation_sequence | \p{Emoji} \uFE0F
|
||||||
|
// | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier} )
|
||||||
|
// ( ZWJ emoji_zwj_element )+ ( \p{WB:ZWJ} ^^ )+
|
||||||
|
//
|
||||||
|
// | emoji_tag_sequence tag_base emoji_character ( \p{Emoji}
|
||||||
|
// | emoji_presentation_sequence | \p{Emoji} \uFE0F
|
||||||
|
// | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier} )
|
||||||
|
// tag_spec [\u{E0020}-\u{E007E}]+
|
||||||
|
// tag_term \u{E007F}
|
||||||
|
//
|
||||||
|
// [1] https://unicode.org/Public/emoji/11.0/emoji-test.txt includes key cap sequences
|
||||||
|
// WITHOUT \uFE0F (emoji presentation selector), annotating them as "non-fully-qualified";
|
||||||
|
// TR#51 says about non-fully-qualified *ZWJ sequences* that implementations may
|
||||||
|
// choose whether to support them for segmentation. This implementation will
|
||||||
|
// recognize /[0-9#*]\u20E3/ - i.e. without \uFE0F - as Emoji.
|
||||||
|
//
|
||||||
|
// See also: http://www.unicode.org/L2/L2016/16315-handling-seg-emoji.pdf
|
||||||
|
// https://docs.google.com/document/d/1yDZ5TUZNVVKaM9zYCCLbRIAKGNZANsAGl0bcNzGGvn8
|
||||||
|
//
|
||||||
|
// In particular, the above docs recommend a modified UAX#29 WB3c rule (covered by TR#51's "emoji_zwj_sequence"):
|
||||||
|
//
|
||||||
|
// WB3c′ ZWJ × (Extended_Pictographic | EmojiNRK)
|
||||||
|
//
|
||||||
|
{EmojiCharOrPresSeqOrModSeq} ( ( \p{WB:ZWJ} {EmojiCharOrPresSeqOrModSeq} )* | {TagSpec}+ {TagTerm} )
|
||||||
|
| {KeyCapBaseCharEx} {EmojiPresentationSelector}? {KeyCapEx}
|
||||||
|
| {RegionalIndicatorEx}{2}
|
||||||
|
{ return EMOJI_TYPE; }
|
||||||
|
|
||||||
|
// UAX#29 WB8. Numeric × Numeric
|
||||||
|
// WB11. Numeric (MidNum | MidNumLetQ) × Numeric
|
||||||
|
// WB12. Numeric × (MidNum | MidNumLetQ) Numeric
|
||||||
|
// WB13a. (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
||||||
|
// WB13b. ExtendNumLet × (AHLetter | Numeric | Katakana)
|
||||||
|
//
|
||||||
|
{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
|
||||||
{ return NUMERIC_TYPE; }
|
{ return NUMERIC_TYPE; }
|
||||||
|
|
||||||
// subset of the below for typing purposes only!
|
// subset of the below for typing purposes only!
|
||||||
|
@ -141,28 +236,28 @@ ComplexContextEx = \p{LB:Complex_Context}
|
||||||
{KatakanaEx}+
|
{KatakanaEx}+
|
||||||
{ return KATAKANA_TYPE; }
|
{ return KATAKANA_TYPE; }
|
||||||
|
|
||||||
// UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
|
// UAX#29 WB5. AHLetter × AHLetter
|
||||||
// WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
|
// WB6. AHLetter × (MidLetter | MidNumLetQ) AHLetter
|
||||||
// WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
|
// WB7. AHLetter (MidLetter | MidNumLetQ) × AHLetter
|
||||||
// WB7a. Hebrew_Letter × Single_Quote
|
// WB7a. Hebrew_Letter × Single_Quote
|
||||||
// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
|
// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
|
||||||
// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
|
// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
|
||||||
// WB9. (ALetter | Hebrew_Letter) × Numeric
|
// WB9. AHLetter × Numeric
|
||||||
// WB10. Numeric × (ALetter | Hebrew_Letter)
|
// WB10. Numeric × AHLetter
|
||||||
// WB13. Katakana × Katakana
|
// WB13. Katakana × Katakana
|
||||||
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
||||||
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
|
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
|
||||||
//
|
//
|
||||||
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
||||||
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
|
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
|
||||||
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
|
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
|
||||||
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
|
| {AHLetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {AHLetterEx} )*
|
||||||
)+
|
)+
|
||||||
)
|
)
|
||||||
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
||||||
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
|
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
|
||||||
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
|
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
|
||||||
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
|
| {AHLetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {AHLetterEx} )*
|
||||||
)+
|
)+
|
||||||
)
|
)
|
||||||
)*
|
)*
|
||||||
|
@ -172,13 +267,13 @@ ComplexContextEx = \p{LB:Complex_Context}
|
||||||
|
|
||||||
// From UAX #29:
|
// From UAX #29:
|
||||||
//
|
//
|
||||||
// [C]haracters with the Line_Break property values of Contingent_Break (CB),
|
// [C]haracters with the Line_Break property values of Contingent_Break (CB),
|
||||||
// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
|
// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
|
||||||
// boundary property values based on criteria outside of the scope of this
|
// boundary property values based on criteria outside of the scope of this
|
||||||
// annex. That means that satisfactory treatment of languages like Chinese
|
// annex. That means that satisfactory treatment of languages like Chinese
|
||||||
// or Thai requires special handling.
|
// or Thai requires special handling.
|
||||||
//
|
//
|
||||||
// In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
|
// In Unicode 9.0, only one character has the \p{Line_Break = Contingent_Break}
|
||||||
// property: U+FFFC (  ) OBJECT REPLACEMENT CHARACTER.
|
// property: U+FFFC (  ) OBJECT REPLACEMENT CHARACTER.
|
||||||
//
|
//
|
||||||
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
|
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
|
||||||
|
@ -191,17 +286,14 @@ ComplexContextEx = \p{LB:Complex_Context}
|
||||||
//
|
//
|
||||||
{ComplexContextEx}+ { return SOUTH_EAST_ASIAN_TYPE; }
|
{ComplexContextEx}+ { return SOUTH_EAST_ASIAN_TYPE; }
|
||||||
|
|
||||||
// UAX#29 WB14. Any ÷ Any
|
// UAX#29 WB999. Any ÷ Any
|
||||||
//
|
//
|
||||||
{HanEx} { return IDEOGRAPHIC_TYPE; }
|
{HanEx} { return IDEOGRAPHIC_TYPE; }
|
||||||
{HiraganaEx} { return HIRAGANA_TYPE; }
|
{HiraganaEx} { return HIRAGANA_TYPE; }
|
||||||
|
|
||||||
|
// UAX#29 WB3. CR × LF
|
||||||
// UAX#29 WB3. CR × LF
|
// WB3a. (Newline | CR | LF) ÷
|
||||||
// WB3a. (Newline | CR | LF) ÷
|
// WB3b. ÷ (Newline | CR | LF)
|
||||||
// WB3b. ÷ (Newline | CR | LF)
|
// WB999. Any ÷ Any
|
||||||
// WB13c. Regional_Indicator × Regional_Indicator
|
|
||||||
// WB14. Any ÷ Any
|
|
||||||
//
|
//
|
||||||
{RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
|
[^] { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, emoji or SE Asian -- ignore it. */ }
|
||||||
{ /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
|
|
||||||
|
|
|
@ -18,8 +18,11 @@ package org.apache.lucene.analysis.standard;
|
||||||
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
@ -27,6 +30,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.MockGraphTokenFilter;
|
import org.apache.lucene.analysis.MockGraphTokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.TestUtil;
|
import org.apache.lucene.util.TestUtil;
|
||||||
|
|
||||||
|
@ -282,7 +286,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testUnicodeWordBreaks() throws Exception {
|
public void testUnicodeWordBreaks() throws Exception {
|
||||||
WordBreakTestUnicode_6_3_0 wordBreakTest = new WordBreakTestUnicode_6_3_0();
|
WordBreakTestUnicode_9_0_0 wordBreakTest = new WordBreakTestUnicode_9_0_0();
|
||||||
wordBreakTest.test(a);
|
wordBreakTest.test(a);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -358,8 +362,80 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
|
||||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "3_1.,2", new String[] { "3_1", "2" });
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "3_1.,2", new String[] { "3_1", "2" });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** simple emoji */
|
||||||
|
public void testEmoji() throws Exception {
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩 💩💩",
|
||||||
|
new String[] { "💩", "💩", "💩" },
|
||||||
|
new String[] { "<EMOJI>", "<EMOJI>", "<EMOJI>" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/** emoji zwj sequence */
|
||||||
|
public void testEmojiSequence() throws Exception {
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "👩❤️👩",
|
||||||
|
new String[] { "👩❤️👩" },
|
||||||
|
new String[] { "<EMOJI>" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/** emoji zwj sequence with fitzpatrick modifier */
|
||||||
|
public void testEmojiSequenceWithModifier() throws Exception {
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "👨🏼⚕️",
|
||||||
|
new String[] { "👨🏼⚕️" },
|
||||||
|
new String[] { "<EMOJI>" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/** regional indicator */
|
||||||
|
public void testEmojiRegionalIndicator() throws Exception {
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "🇺🇸🇺🇸",
|
||||||
|
new String[] { "🇺🇸", "🇺🇸" },
|
||||||
|
new String[] { "<EMOJI>", "<EMOJI>" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/** variation sequence */
|
||||||
|
public void testEmojiVariationSequence() throws Exception {
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣",
|
||||||
|
new String[] { "#️⃣" },
|
||||||
|
new String[] { "<EMOJI>" });
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "3️⃣",
|
||||||
|
new String[] { "3️⃣",},
|
||||||
|
new String[] { "<EMOJI>" });
|
||||||
|
|
||||||
|
// text presentation sequences
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "#\uFE0E",
|
||||||
|
new String[] { },
|
||||||
|
new String[] { });
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "3\uFE0E", // \uFE0E is included in \p{WB:Extend}
|
||||||
|
new String[] { "3\uFE0E",},
|
||||||
|
new String[] { "<NUM>" });
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E", // \u2B55 = HEAVY BLACK CIRCLE
|
||||||
|
new String[] { "\u2B55",},
|
||||||
|
new String[] { "<EMOJI>" });
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E\u200D\u2B55\uFE0E",
|
||||||
|
new String[] { "\u2B55", "\u200D\u2B55"},
|
||||||
|
new String[] { "<EMOJI>", "<EMOJI>" });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testEmojiTagSequence() throws Exception {
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "🏴",
|
||||||
|
new String[] { "🏴" },
|
||||||
|
new String[] { "<EMOJI>" });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testEmojiTokenization() throws Exception {
|
||||||
|
// simple emoji around latin
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "poo💩poo",
|
||||||
|
new String[] { "poo", "💩", "poo" },
|
||||||
|
new String[] { "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>" });
|
||||||
|
// simple emoji around non-latin
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩中國💩",
|
||||||
|
new String[] { "💩", "中", "國", "💩" },
|
||||||
|
new String[] { "<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>" });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testUnicodeEmojiTests() throws Exception {
|
||||||
|
EmojiTokenizationTestUnicode_11_0 emojiTest = new EmojiTokenizationTestUnicode_11_0();
|
||||||
|
emojiTest.test(a);
|
||||||
|
}
|
||||||
|
|
||||||
/** blast some random strings through the analyzer */
|
/** blast some random strings through the analyzer */
|
||||||
public void testRandomStrings() throws Exception {
|
public void testRandomStrings() throws Exception {
|
||||||
Analyzer analyzer = new StandardAnalyzer();
|
Analyzer analyzer = new StandardAnalyzer();
|
||||||
|
@ -416,4 +492,53 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
|
||||||
assertAnalyzesTo(a, "ab cd toolong xy z", new String[]{"ab", "cd", "toolo", "ng", "xy", "z"});
|
assertAnalyzesTo(a, "ab cd toolong xy z", new String[]{"ab", "cd", "toolo", "ng", "xy", "z"});
|
||||||
a.close();
|
a.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testSplitSurrogatePairWithSpoonFeedReader() throws Exception {
|
||||||
|
String text = "12345678\ud800\udf00"; // U+D800 U+DF00 = U+10300 = 𐌀 (OLD ITALIC LETTER A)
|
||||||
|
|
||||||
|
// Collect tokens with normal reader
|
||||||
|
StandardAnalyzer a = new StandardAnalyzer();
|
||||||
|
TokenStream ts = a.tokenStream("dummy", text);
|
||||||
|
List<String> tokens = new ArrayList<>();
|
||||||
|
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||||
|
ts.reset();
|
||||||
|
while (ts.incrementToken()) {
|
||||||
|
tokens.add(termAtt.toString());
|
||||||
|
}
|
||||||
|
ts.end();
|
||||||
|
ts.close();
|
||||||
|
|
||||||
|
// Tokens from a spoon-feed reader should be the same as from a normal reader
|
||||||
|
// The 9th char is a high surrogate, so the 9-max-chars spoon-feed reader will split the surrogate pair at a read boundary
|
||||||
|
Reader reader = new SpoonFeedMaxCharsReaderWrapper(9, new StringReader(text));
|
||||||
|
ts = a.tokenStream("dummy", reader);
|
||||||
|
termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||||
|
ts.reset();
|
||||||
|
for (int tokenNum = 0 ; ts.incrementToken() ; ++tokenNum) {
|
||||||
|
assertEquals("token #" + tokenNum + " mismatch: ", termAtt.toString(), tokens.get(tokenNum));
|
||||||
|
}
|
||||||
|
ts.end();
|
||||||
|
ts.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class SpoonFeedMaxCharsReaderWrapper extends Reader {
|
||||||
|
private final Reader in;
|
||||||
|
private final int maxChars;
|
||||||
|
|
||||||
|
public SpoonFeedMaxCharsReaderWrapper(int maxChars, Reader in) {
|
||||||
|
this.in = in;
|
||||||
|
this.maxChars = maxChars;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() throws IOException {
|
||||||
|
in.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the configured number of chars if available */
|
||||||
|
@Override
|
||||||
|
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||||
|
return in.read(cbuf, off, Math.min(maxChars, len));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,150 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
|
||||||
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
# contributor license agreements. See the NOTICE file distributed with
|
||||||
|
# this work for additional information regarding copyright ownership.
|
||||||
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
# (the "License"); you may not use this file except in compliance with
|
||||||
|
# the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
use warnings;
|
||||||
|
use strict;
|
||||||
|
use File::Spec;
|
||||||
|
use Getopt::Long;
|
||||||
|
use LWP::UserAgent;
|
||||||
|
|
||||||
|
my ($volume, $directory, $script_name) = File::Spec->splitpath($0);
|
||||||
|
|
||||||
|
# Emoji data version to generate tests for, taken from the --version (-v) option.
my $version = '';

# The version is interpolated into both the download URL and the generated Java
# class name, so require the whole value to be X.Y rather than merely contain it.
unless (GetOptions("version=s" => \$version) && $version =~ /\A\d+\.\d+\z/) {
  print STDERR "Usage: $script_name -v <version>\n";
  # Only explain the expected format when a (malformed) version was supplied.
  print STDERR "\tversion must be of the form X.Y, e.g. 11.0\n"
    if (length $version);
  exit 1;
}
|
||||||
|
my $url = "http://www.unicode.org/Public/emoji/${version}/emoji-test.txt";
|
||||||
|
my $underscore_version = $version;
|
||||||
|
$underscore_version =~ s/\./_/g;
|
||||||
|
my $class_name = "EmojiTokenizationTestUnicode_${underscore_version}";
|
||||||
|
my $output_filename = "${class_name}.java";
|
||||||
|
my $header =<<"__HEADER__";
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.analysis.standard;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.junit.Ignore;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class was automatically generated by ${script_name}
|
||||||
|
* from: ${url}
|
||||||
|
*
|
||||||
|
* emoji-test.txt contains emoji char sequences, which are represented as
|
||||||
|
* tokenization tests in this class.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
\@Ignore
|
||||||
|
public class ${class_name} extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
|
public void test(Analyzer analyzer) throws Exception {
|
||||||
|
for (int i = 0 ; i < tests.length ; i += 2) {
|
||||||
|
String test = tests[i + 1];
|
||||||
|
try {
|
||||||
|
assertAnalyzesTo(analyzer, test, new String[] { test }, new String[] { "<EMOJI>" });
|
||||||
|
} catch (Throwable t) {
|
||||||
|
throw new Exception("Failed to tokenize \\"" + tests[i] + "\\":", t);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private String[] tests = new String[] {
|
||||||
|
__HEADER__
|
||||||
|
|
||||||
|
my @tests = split /\r?\n/, get_URL_content($url);
|
||||||
|
|
||||||
|
my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
|
||||||
|
# Open the generated Java file for writing.  The low-precedence "or" is needed:
# with "||" the die would attach to the (always true) filename string, so a
# failed open would go unnoticed.  Three-argument open keeps the path from
# being interpreted as a mode.
open(OUT, '>', $output_path)
  or die "Error opening '$output_path' for writing: $!";
|
||||||
|
|
||||||
|
print STDERR "Writing '$output_path'...";
|
||||||
|
|
||||||
|
print OUT $header;
|
||||||
|
|
||||||
|
# Emit one pair of Java string literals per emoji-test.txt data line: the
# original line (used in failure messages) followed by the test input written
# as \uXXXX escapes.  Pairs are separated by ",\n\n".
my $isFirst = 1;
for my $line (@tests) {
  next if ($line =~ /^\s*(?:|\#.*)$/); # Skip blank or comment-only lines

  # Example line: 1F46E 1F3FB 200D 2642 FE0F ; fully-qualified # 👮🏻♂️ man police officer: light skin tone
  $line =~ s/\s+$//;  # Trim trailing whitespace
  $line =~ s/\t/  /g; # Convert tabs to two spaces (no tabs allowed in Lucene source)

  # The code point sequence is everything before the first ';'.  Skip lines
  # without one instead of operating on an undefined value.
  my ($test_string) = $line =~ /^(.*?)\s*;/;
  unless (defined $test_string) {
    print STDERR "\nSkipping line without a ';' separator: $line\n";
    next;
  }

  print OUT ",\n\n" unless $isFirst;
  $isFirst = 0;

  # The line is embedded in a Java string literal, so escape the characters
  # that would otherwise terminate or corrupt that literal.
  (my $description = $line) =~ s/(["\\])/\\$1/g;
  print OUT " \"$description\",\n";

  # Turn each hex code point into a \uXXXX escape, expanding code points of
  # five or more hex digits into their UTF-16 surrogate pair.
  $test_string =~ s/([0-9A-F]+)/\\u$1/g;
  $test_string =~ s/\\u([0-9A-F]{5,})/join('', map { "\\u$_" } above_BMP_char_to_surrogates($1))/ge;
  $test_string =~ s/\s//g;
  print OUT " \"${test_string}\"";
}
print OUT " };\n}\n";

# Buffered write errors only surface at close, so check it.
close(OUT)
  or die "Error closing '$output_path': $!";
print STDERR "done.\n";
|
||||||
|
|
||||||
|
|
||||||
|
# sub above_BMP_char_to_surrogates
|
||||||
|
#
|
||||||
|
# Converts hex references to chars above the BMP (i.e., greater than 0xFFFF)
|
||||||
|
# to the corresponding UTF-16 surrogate pair
|
||||||
|
#
|
||||||
|
# Assumption: input string is a sequence of more than four hex digits
|
||||||
|
#
|
||||||
|
# Takes one code point as a string of hex digits and returns a list of
# four-digit uppercase hex strings: the UTF-16 high and low surrogates for a
# code point above the BMP, or the single code unit for a code point within
# the BMP (e.g. a zero-padded five-digit value such as "0270A").
sub above_BMP_char_to_surrogates {
  my ($hex_codepoint) = @_;
  my $codepoint = hex($hex_codepoint);

  # BMP code points have no surrogate pair; the arithmetic below would
  # otherwise run on a negative offset and yield garbage.
  return sprintf("%04X", $codepoint) if $codepoint < 0x10000;

  # UTF-16: the top 10 bits of (code point - 0x10000) select the high
  # surrogate, the bottom 10 bits the low surrogate.
  my $offset = $codepoint - 0x10000;
  my $high_surrogate = 0xD800 + ($offset >> 10);
  my $low_surrogate = 0xDC00 + ($offset & 0x3FF);
  return map { sprintf("%04X", $_) } ($high_surrogate, $low_surrogate);
}
|
||||||
|
|
||||||
|
|
||||||
|
# sub get_URL_content
|
||||||
|
#
|
||||||
|
# Retrieves and returns the content of the given URL.
|
||||||
|
#
|
||||||
|
# Issues a GET request for the given URL and returns the response content.
# Progress is reported on STDERR; if the response is not a success, the
# status line is printed and the script exits with status 1.
sub get_URL_content {
  my ($url) = @_;

  print STDERR "Retrieving '$url'...";

  my $response = LWP::UserAgent->new->request(HTTP::Request->new(GET => $url));
  if (!$response->is_success) {
    print STDERR "Failed to download '$url':\n\t", $response->status_line, "\n";
    exit 1;
  }

  print STDERR "done.\n";
  return $response->content;
}
|
|
@ -40,8 +40,6 @@ $underscore_version =~ s/\./_/g;
|
||||||
my $class_name = "WordBreakTestUnicode_${underscore_version}";
|
my $class_name = "WordBreakTestUnicode_${underscore_version}";
|
||||||
my $output_filename = "${class_name}.java";
|
my $output_filename = "${class_name}.java";
|
||||||
my $header =<<"__HEADER__";
|
my $header =<<"__HEADER__";
|
||||||
package org.apache.lucene.analysis;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
@ -59,6 +57,8 @@ package org.apache.lucene.analysis;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.analysis.standard;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.junit.Ignore;
|
import org.junit.Ignore;
|
||||||
|
@ -81,7 +81,7 @@ import org.junit.Ignore;
|
||||||
* \\p{WordBreak = Hebrew_Letter}
|
* \\p{WordBreak = Hebrew_Letter}
|
||||||
* \\p{WordBreak = Katakana}
|
* \\p{WordBreak = Katakana}
|
||||||
* \\p{WordBreak = Numeric} (Excludes full-width Arabic digits)
|
* \\p{WordBreak = Numeric} (Excludes full-width Arabic digits)
|
||||||
* [\\uFF10-\\uFF19] (Full-width Arabic digits)
|
* [\\uFF10-\\uFF19] (Full-width Arabic digits)
|
||||||
*/
|
*/
|
||||||
\@Ignore
|
\@Ignore
|
||||||
public class ${class_name} extends BaseTokenStreamTestCase {
|
public class ${class_name} extends BaseTokenStreamTestCase {
|
||||||
|
@ -91,6 +91,7 @@ __HEADER__
|
||||||
|
|
||||||
my $codepoints = [];
|
my $codepoints = [];
|
||||||
map { $codepoints->[$_] = 1 } (0xFF10..0xFF19);
|
map { $codepoints->[$_] = 1 } (0xFF10..0xFF19);
|
||||||
|
my $regional_indicator_codepoints = [];
|
||||||
# Complex_Context is an alias for 'SA', which is used in LineBreak.txt
|
# Complex_Context is an alias for 'SA', which is used in LineBreak.txt
|
||||||
# Using lowercase versions of property value names to allow for case-
|
# Using lowercase versions of property value names to allow for case-
|
||||||
# insensitive comparison with the names in the Unicode data files.
|
# insensitive comparison with the names in the Unicode data files.
|
||||||
|
@ -98,7 +99,9 @@ parse_Unicode_data_file($line_break_url, $codepoints, {'sa' => 1});
|
||||||
parse_Unicode_data_file($scripts_url, $codepoints,
|
parse_Unicode_data_file($scripts_url, $codepoints,
|
||||||
{'han' => 1, 'hiragana' => 1});
|
{'han' => 1, 'hiragana' => 1});
|
||||||
parse_Unicode_data_file($word_break_url, $codepoints,
|
parse_Unicode_data_file($word_break_url, $codepoints,
|
||||||
{'aletter' => 1, 'hebrew_letter' => 1, 'katakana' => 1, 'numeric' => 1});
|
{'aletter' => 1, 'hebrew_letter' => 1, 'katakana' => 1, 'numeric' => 1, 'e_base' => 1,
|
||||||
|
'e_modifier' => 1, 'glue_after_zwj' => 1, 'e_base_gaz' => 1});
|
||||||
|
parse_Unicode_data_file($word_break_url, $regional_indicator_codepoints, {'regional_indicator' => 1});
|
||||||
my @tests = split /\r?\n/, get_URL_content($word_break_test_url);
|
my @tests = split /\r?\n/, get_URL_content($word_break_test_url);
|
||||||
|
|
||||||
my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
|
my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
|
||||||
|
@ -124,10 +127,21 @@ for my $line (@tests) {
|
||||||
$test_string =~ s/\\u000D/\\r/g;
|
$test_string =~ s/\\u000D/\\r/g;
|
||||||
$test_string =~ s/\\u0022/\\\"/g;
|
$test_string =~ s/\\u0022/\\\"/g;
|
||||||
$sequence =~ s/^\s*÷\s*//; # Trim leading break character
|
$sequence =~ s/^\s*÷\s*//; # Trim leading break character
|
||||||
|
|
||||||
|
# TODO: When upgrading JFlex to a version that supports Unicode 11.0+: remove the special case below for a Unicode 9.0 test data line that conflicts with TR#51 11.0 test data
|
||||||
|
# ÷ 200D ÷ 261D ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) ÷ [999.0] WHITE UP POINTING INDEX (E_Base) ÷ [0.3]
|
||||||
|
if ($sequence =~ /^200D\s*÷\s*261D$/) {
|
||||||
|
print OUT " // Skipping this test because it conflicts with TR#51 v11.0 rules.\n\n";
|
||||||
|
next;
|
||||||
|
}
|
||||||
|
|
||||||
my @tokens = ();
|
my @tokens = ();
|
||||||
|
my $isfirst = 0;
|
||||||
for my $candidate (split /\s*÷\s*/, $sequence) {
|
for my $candidate (split /\s*÷\s*/, $sequence) {
|
||||||
|
$isfirst = 1;
|
||||||
my @chars = ();
|
my @chars = ();
|
||||||
my $has_wanted_char = 0;
|
my $has_wanted_chars = 0;
|
||||||
|
my $prev_char_regional_indicator = 0;
|
||||||
while ($candidate =~ /([0-9A-F]+)/gi) {
|
while ($candidate =~ /([0-9A-F]+)/gi) {
|
||||||
my $hexchar = $1;
|
my $hexchar = $1;
|
||||||
if (4 == length($hexchar)) {
|
if (4 == length($hexchar)) {
|
||||||
|
@ -135,12 +149,21 @@ for my $line (@tests) {
|
||||||
} else {
|
} else {
|
||||||
push @chars, above_BMP_char_to_surrogates($hexchar);
|
push @chars, above_BMP_char_to_surrogates($hexchar);
|
||||||
}
|
}
|
||||||
unless ($has_wanted_char) {
|
unless ($has_wanted_chars) {
|
||||||
$has_wanted_char = 1 if (defined($codepoints->[hex($hexchar)]));
|
my $codepoint = hex($hexchar);
|
||||||
|
if (defined($codepoints->[$codepoint])) {
|
||||||
|
$has_wanted_chars = 1;
|
||||||
|
} elsif (defined($regional_indicator_codepoints->[$codepoint])) {
|
||||||
|
if (1 == $prev_char_regional_indicator) {
|
||||||
|
$has_wanted_chars = 1; # must be 2 regional indicators in a row
|
||||||
|
} else {
|
||||||
|
$prev_char_regional_indicator = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if ($has_wanted_char) {
|
if ($has_wanted_chars) {
|
||||||
push @tokens, '"'.join('', map { "\\u$_" } @chars).'"';
|
push @tokens, '"'.join('', map { $_ eq "0022" ? "\\\"" : "\\u$_" } @chars).'"';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
print OUT " assertAnalyzesTo(analyzer, \"${test_string}\",\n";
|
print OUT " assertAnalyzesTo(analyzer, \"${test_string}\",\n";
|
||||||
|
|
Loading…
Reference in New Issue