LUCENE-8527: Upgrade JFlex to 1.7.0. StandardTokenizer and UAX29URLEmailTokenizer now support Unicode 9.0, and provide UTS#51 v11.0 Emoji tokenization with the '<EMOJI>' token type.

This commit is contained in:
Steve Rowe 2019-01-08 13:33:49 -05:00
parent 7db4121b45
commit 283b19a8da
24 changed files with 57040 additions and 35638 deletions

View File

@ -241,6 +241,11 @@ Optimizations
======================= Lucene 7.7.0 =======================
Changes in Runtime Behavior
* LUCENE-8527: StandardTokenizer and UAX29URLEmailTokenizer now support Unicode 9.0,
and provide Unicode UTS#51 v11.0 Emoji tokenization with the "<EMOJI>" token type.
Build
* LUCENE-8611: Update randomizedtesting to 2.7.2, JUnit to 4.12, add hamcrest-core
@ -293,6 +298,9 @@ Improvements
* LUCENE-8581: Change LatLonShape encoding to use 4 bytes Per Dimension.
(Ignacio Vera, Nick Knize, Adrien Grand)
* LUCENE-8527: Upgrade JFlex dependency to 1.7.0; in StandardTokenizer and UAX29URLEmailTokenizer,
increase supported Unicode version from 6.3 to 9.0, and support Unicode UTS#51 v11.0 Emoji tokenization.
Optimizations

View File

@ -33,18 +33,14 @@
<property name="unicode-props-file" location="src/java/org/apache/lucene/analysis/util/UnicodeProps.java"/>
<target name="jflex" depends="-install-jflex,clean-jflex,-jflex-ClassicAnalyzer,-jflex-UAX29URLEmailTokenizer,
-jflex-wiki-tokenizer,-jflex-HTMLStripCharFilter"/>
<!-- Because of a bug in JFlex's ant task, HTMLStripCharFilter has to be generated last. -->
<!-- Otherwise the "%apiprivate" option used in its specification will leak into following -->
<!-- ant task invocations. -->
<target name="jflex" depends="init,clean-jflex,-jflex-wiki-tokenizer,-jflex-ClassicAnalyzer,
-jflex-UAX29URLEmailTokenizer,-jflex-HTMLStripCharFilter"/>
<target name="-jflex-HTMLStripCharFilter"
depends="init,generate-jflex-html-char-entities">
<jflex file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex"
outdir="src/java/org/apache/lucene/analysis/charfilter"
nobak="on" inputstreamctor="false"/>
<!-- Remove the inappropriate JFlex-generated constructor -->
<replaceregexp file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java"
match="/\*\*\s*\*\s*Creates a new scanner\s*\*\s*\*\s*@param\s*in\s*the java.io.Reader to read input from\.\s*\*/\s*public HTMLStripCharFilter\(java\.io\.Reader in\)\s*\{\s*this.zzReader = in;\s*\}"
replace="" flags="s"/>
<target name="-jflex-HTMLStripCharFilter" depends="-install-jflex,generate-jflex-html-char-entities">
<run-jflex dir="src/java/org/apache/lucene/analysis/charfilter" name="HTMLStripCharFilter"/>
</target>
<target name="generate-jflex-html-char-entities">
@ -58,17 +54,17 @@
<fixcrlf file="src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex" encoding="UTF-8"/>
</target>
<target name="-jflex-wiki-tokenizer" depends="init,-install-jflex">
<target name="-jflex-wiki-tokenizer" depends="-install-jflex">
<run-jflex dir="src/java/org/apache/lucene/analysis/wikipedia" name="WikipediaTokenizerImpl"/>
</target>
<target name="-jflex-UAX29URLEmailTokenizer" depends="init,-install-jflex">
<run-jflex-and-disable-buffer-expansion
dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
<target name="-jflex-ClassicAnalyzer" depends="-install-jflex">
<run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/>
</target>
<target name="-jflex-ClassicAnalyzer" depends="init,-install-jflex">
<run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/>
<target name="-jflex-UAX29URLEmailTokenizer" depends="-install-jflex">
<run-jflex-and-disable-buffer-expansion
dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
</target>
<target name="clean-jflex">

View File

@ -33,7 +33,7 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
@SuppressWarnings("fallthrough")
%%
%unicode 6.3
%unicode 9.0
%apiprivate
%type int
%final
@ -50,6 +50,10 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
%xstate START_TAG_TAIL_INCLUDE, START_TAG_TAIL_EXCLUDE, START_TAG_TAIL_SUBSTITUTE
%xstate STYLE, STYLE_COMMENT
%init{
super(in);
%init}
// From XML 1.0 <http://www.w3.org/TR/xml/>:
//
// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [...]
@ -165,25 +169,15 @@ InlineElment = ( [aAbBiIqQsSuU] |
private TextSegment outputSegment = inputSegment;
private TextSegment entitySegment = new TextSegment(2);
/**
* Creates a new HTMLStripCharFilter over the provided Reader.
* @param source Reader to strip html tags from.
*/
public HTMLStripCharFilter(Reader source) {
super(source);
this.zzReader = source;
}
/**
* Creates a new HTMLStripCharFilter over the provided Reader
* with the specified start and end tags.
* @param source Reader to strip html tags from.
* @param in Reader to strip html tags from.
* @param escapedTags Tags in this set (both start and end tags)
* will not be filtered out.
*/
public HTMLStripCharFilter(Reader source, Set<String> escapedTags) {
super(source);
this.zzReader = source;
public HTMLStripCharFilter(Reader in, Set<String> escapedTags) {
this(in);
if (null != escapedTags) {
for (String tag : escapedTags) {
if (tag.equalsIgnoreCase("BR")) {

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.6.0 */
/* The following code was generated by JFlex 1.7.0 */
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -251,7 +251,7 @@ class ClassicTokenizerImpl {
/* error messages for the codes above */
private static final String ZZ_ERROR_MSG[] = {
"Unkown internal scanner error",
"Unknown internal scanner error",
"Error: could not match input",
"Error: pushback value was too large"
};
@ -323,11 +323,11 @@ class ClassicTokenizerImpl {
private int yycolumn;
/**
* zzAtBOL == true <=> the scanner is currently at the beginning of a line
* zzAtBOL == true iff the scanner is currently at the beginning of a line
*/
private boolean zzAtBOL = true;
/** zzAtEOF == true <=> the scanner is at the EOF */
/** zzAtEOF == true iff the scanner is at the EOF */
private boolean zzAtEOF;
/** denotes if the user-EOF-code has already been executed */
@ -436,28 +436,29 @@ public final void getText(CharTermAttribute t) {
}
/* fill the buffer with new input */
int requested = zzBuffer.length - zzEndRead;
int totalRead = 0;
while (totalRead < requested) {
int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
if (numRead == -1) {
break;
}
totalRead += numRead;
}
int requested = zzBuffer.length - zzEndRead;
int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
if (totalRead > 0) {
zzEndRead += totalRead;
if (totalRead == requested) { /* possibly more input available */
/* not supposed to occur according to specification of java.io.Reader */
if (numRead == 0) {
throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
}
if (numRead > 0) {
zzEndRead += numRead;
/* If numRead == requested, we might have requested too few chars to
encode a full Unicode character. We assume that a Reader would
otherwise never return half characters. */
if (numRead == requested) {
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
--zzEndRead;
zzFinalHighSurrogate = 1;
}
}
/* potentially more input available */
return false;
}
// totalRead = 0: End of stream
/* numRead < 0 ==> end of stream */
return true;
}
@ -681,55 +682,65 @@ public final void getText(CharTermAttribute t) {
// store back cached position
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 1:
{ /* Break so we don't hit fall-through warning: */ break;/* ignore */
}
case 11: break;
case 2:
{ return ALPHANUM;
}
case 12: break;
case 3:
{ return CJ;
}
case 13: break;
case 4:
{ return HOST;
}
case 14: break;
case 5:
{ return NUM;
}
case 15: break;
case 6:
{ return APOSTROPHE;
}
case 16: break;
case 7:
{ return COMPANY;
}
case 17: break;
case 8:
{ return ACRONYM_DEP;
}
case 18: break;
case 9:
{ return ACRONYM;
}
case 19: break;
case 10:
{ return EMAIL;
}
case 20: break;
default:
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
return YYEOF;
}
else {
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
return YYEOF;
}
else {
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 1:
{ /* Break so we don't hit fall-through warning: */ break;/* ignore */
}
// fall through
case 11: break;
case 2:
{ return ALPHANUM;
}
// fall through
case 12: break;
case 3:
{ return CJ;
}
// fall through
case 13: break;
case 4:
{ return HOST;
}
// fall through
case 14: break;
case 5:
{ return NUM;
}
// fall through
case 15: break;
case 6:
{ return APOSTROPHE;
}
// fall through
case 16: break;
case 7:
{ return COMPANY;
}
// fall through
case 17: break;
case 8:
{ return ACRONYM_DEP;
}
// fall through
case 18: break;
case 9:
{ return ACRONYM;
}
// fall through
case 19: break;
case 10:
{ return EMAIL;
}
// fall through
case 20: break;
default:
zzScanError(ZZ_NO_MATCH);
}
}
}
}
}

View File

@ -32,33 +32,32 @@ import org.apache.lucene.util.AttributeFactory;
* algorithm, as specified in
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
* URLs and email addresses are also tokenized according to the relevant RFCs.
* <p>
* Tokens produced are of the following types:
* <ul>
* <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
* <li>&lt;NUM&gt;: A number</li>
* <li>&lt;URL&gt;: A URL</li>
* <li>&lt;EMAIL&gt;: An email address</li>
* <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
* <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
* <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
* </ul>
*/
public final class UAX29URLEmailTokenizer extends Tokenizer {
/** A private instance of the JFlex-constructed scanner */
private final UAX29URLEmailTokenizerImpl scanner;
public static final int ALPHANUM = 0;
public static final int NUM = 1;
public static final int SOUTHEAST_ASIAN = 2;
public static final int IDEOGRAPHIC = 3;
public static final int HIRAGANA = 4;
public static final int KATAKANA = 5;
public static final int HANGUL = 6;
public static final int URL = 7;
public static final int EMAIL = 8;
/** Alpha/numeric token type */
public static final int ALPHANUM = 0;
/** Numeric token type */
public static final int NUM = 1;
/** Southeast Asian token type */
public static final int SOUTHEAST_ASIAN = 2;
/** Ideographic token type */
public static final int IDEOGRAPHIC = 3;
/** Hiragana token type */
public static final int HIRAGANA = 4;
/** Katakana token type */
public static final int KATAKANA = 5;
/** Hangul token type */
public static final int HANGUL = 6;
/** URL token type */
public static final int URL = 7;
/** Email token type */
public static final int EMAIL = 8;
/** Emoji token type. */
public static final int EMOJI = 9;
/** String token types that correspond to token type int constants */
public static final String [] TOKEN_TYPES = new String [] {
@ -71,6 +70,7 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL],
"<URL>",
"<EMAIL>",
StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI]
};
/** Absolute maximum sized token */

View File

@ -37,12 +37,13 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
* <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
* <li>&lt;KATAKANA&gt;: A sequence of katakana characters</li>
* <li>&lt;HANGUL&gt;: A sequence of Hangul characters</li>
* <li>&lt;EMOJI&gt;: A sequence of Emoji characters</li>
* </ul>
*/
@SuppressWarnings("fallthrough")
%%
%unicode 6.3
%unicode 9.0
%integer
%final
%public
@ -52,22 +53,73 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%xstate AVOID_BAD_URL
%buffer 255
// UAX#29 WB4. X (Extend | Format)* --> X
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
//
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] [\p{WB:Format}\p{WB:Extend}]*
HebrewOrALetterEx = [\p{WB:HebrewLetter}\p{WB:ALetter}] [\p{WB:Format}\p{WB:Extend}]*
NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] [\p{WB:Format}\p{WB:Extend}]*
KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
HanEx = \p{Script:Han} [\p{WB:Format}\p{WB:Extend}]*
HiraganaEx = \p{Script:Hiragana} [\p{WB:Format}\p{WB:Extend}]*
SingleQuoteEx = \p{WB:Single_Quote} [\p{WB:Format}\p{WB:Extend}]*
DoubleQuoteEx = \p{WB:Double_Quote} [\p{WB:Format}\p{WB:Extend}]*
HebrewLetterEx = \p{WB:Hebrew_Letter} [\p{WB:Format}\p{WB:Extend}]*
RegionalIndicatorEx = \p{WB:RegionalIndicator} [\p{WB:Format}\p{WB:Extend}]*
ComplexContextEx = \p{LB:Complex_Context} [\p{WB:Format}\p{WB:Extend}]*
ExtFmtZwj = [\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]*
//////////////////////////////////////////////////////////////////////////
// Begin Emoji Macros - see documentation below, near the EMOJI_TYPE rule
// TODO: Remove this include file when JFlex supports these properties directly (in Unicode 11.0+)
%include ../../../../../../../../../core/src/data/jflex/UnicodeEmojiProperties.jflex
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
//
// \uFE0E (Text Presentation Selector) and \uFE0F (Emoji Presentation Selector) - included in \p{WB:Extend}
// - are explicitly excluded here so that we can properly handle Emoji sequences.
//
ExtFmtZwjSansPresSel = [[\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]--[\uFE0E\uFE0F]]*
KeyCapBaseChar = [0-9#*]
KeyCapBaseCharEx = {KeyCapBaseChar} {ExtFmtZwjSansPresSel}
KeyCap = \u20E3
KeyCapEx = {KeyCap} {ExtFmtZwjSansPresSel}
// # \u3030 = WAVY DASH; \u303D = PART ALTERNATION MARK
AccidentalEmoji = [©®™\u3030\u303D]
EmojiRKAM = ( \p{WB:Regional_Indicator} | {KeyCapBaseChar} | {AccidentalEmoji} | {Emoji_Modifier} )
// Unlike Unicode properties, macros are not allowed in character classes, so we achieve set difference
// by applying DeMorgan: the expression that matches everything of 'a' not matched by 'b' is: !(!a|b)
// TODO: Convert this expression to character class difference when JFlex supports the properties directly (in Unicode 11.0+)
EmojiSansRKAM = !( ! {Emoji} | {EmojiRKAM} )
EmojiChar = ( {Extended_Pictographic} | {EmojiSansRKAM} )
EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel}
EmojiModifierBaseEx = {Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
EmojiModifierEx = {Emoji_Modifier} {ExtFmtZwjSansPresSel}
EmojiPresentationSelector = \uFE0F
EmojiCharOrPresSeqOrModSeq = ( \p{WB:ZWJ}* {EmojiCharEx} {EmojiPresentationSelector}? ) | ( ( \p{WB:ZWJ}* {EmojiModifierBaseEx} )? {EmojiModifierEx} )
TagSpec = [\u{E0020}-\u{E007E}]
TagTerm = \u{E007F}
// End Emoji Macros
//////////////////////////////////////////////////////////////////////////
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
//
ExtFmtZwj = [\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]*
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] {ExtFmtZwj}
AHLetterEx = [\p{WB:ALetter}\p{WB:Hebrew_Letter}] {ExtFmtZwj}
NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] {ExtFmtZwj}
KatakanaEx = \p{WB:Katakana} {ExtFmtZwj}
MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] {ExtFmtZwj}
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] {ExtFmtZwj}
ExtendNumLetEx = \p{WB:ExtendNumLet} {ExtFmtZwj}
HanEx = \p{Script:Han} {ExtFmtZwj}
HiraganaEx = \p{Script:Hiragana} {ExtFmtZwj}
SingleQuoteEx = \p{WB:Single_Quote} {ExtFmtZwj}
DoubleQuoteEx = \p{WB:Double_Quote} {ExtFmtZwj}
HebrewLetterEx = \p{WB:Hebrew_Letter} {ExtFmtZwj}
RegionalIndicatorEx = \p{WB:Regional_Indicator} {ExtFmtZwj}
ComplexContextEx = \p{LB:Complex_Context} {ExtFmtZwj}
// URL and E-mail syntax specifications:
//
@ -174,18 +226,28 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
*/
public static final int SOUTH_EAST_ASIAN_TYPE = UAX29URLEmailTokenizer.SOUTHEAST_ASIAN;
/** Ideographic token type */
public static final int IDEOGRAPHIC_TYPE = UAX29URLEmailTokenizer.IDEOGRAPHIC;
/** Hiragana token type */
public static final int HIRAGANA_TYPE = UAX29URLEmailTokenizer.HIRAGANA;
/** Katakana token type */
public static final int KATAKANA_TYPE = UAX29URLEmailTokenizer.KATAKANA;
/** Hangul token type */
public static final int HANGUL_TYPE = UAX29URLEmailTokenizer.HANGUL;
/** Email token type */
public static final int EMAIL_TYPE = UAX29URLEmailTokenizer.EMAIL;
/** URL token type */
public static final int URL_TYPE = UAX29URLEmailTokenizer.URL;
/** Emoji token type */
public static final int EMOJI_TYPE = UAX29URLEmailTokenizer.EMOJI;
/** Character count processed so far */
public final int yychar()
{
return yychar;
@ -213,11 +275,11 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
<YYINITIAL, AVOID_BAD_URL> {
// UAX#29 WB1. sot ÷
// WB2. ÷ eot
// UAX#29 WB1. sot ÷ Any
// WB2. Any ÷ eot
//
<<EOF>> { return YYEOF; }
{URL} { yybegin(YYINITIAL); return URL_TYPE; }
// LUCENE-5391: Don't recognize no-scheme domain-only URLs with a following alphanumeric character
@ -244,14 +306,61 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
{EMAIL} { yybegin(YYINITIAL); return EMAIL_TYPE; }
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
// WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
// Instead of these: UAX#29 WB3c. ZWJ × (Glue_After_Zwj | EBG)
// WB14. (E_Base | EBG) × E_Modifier
// WB15. ^ (RI RI)* RI × RI
// WB16. [^RI] (RI RI)* RI × RI
//
// We use the "emoji_sequence" rule from http://www.unicode.org/reports/tr51/tr51-14.html (Unicode 11.0)
// and the Emoji data from http://unicode.org/Public/emoji/11.0/emoji-data.txt (in included file UnicodeEmojiProperties.jflex)
//
// emoji_sequence :=
// Top-level EBNF Expanded #1 Expanded #2 Expanded #3
// --------------------- ---------------------------- ----------------------------- ----------------------------------------------
// emoji_core_sequence emoji_combining_sequence emoji_character ( \p{Emoji}
// | emoji_presentation_sequence | \p{Emoji} \uFE0F
// | emoji_keycap_sequence | [0-9#*] \u{FE0F 20E3} [1]
// | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier}
// | emoji_flag_sequence | \p{WB:Regional_Indicator}{2} )
//
// | emoji_zwj_sequence emoji_zwj_element emoji_character ( \p{Emoji}
// | emoji_presentation_sequence | \p{Emoji} \uFE0F
// | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier} )
// ( ZWJ emoji_zwj_element )+ ( \p{WB:ZWJ} ^^ )+
//
// | emoji_tag_sequence tag_base emoji_character ( \p{Emoji}
// | emoji_presentation_sequence | \p{Emoji} \uFE0F
// | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier} )
// tag_spec [\u{E0020}-\u{E007E}]+
// tag_term \u{E007F}
//
// [1] https://unicode.org/Public/emoji/11.0/emoji-test.txt includes key cap sequences
// WITHOUT \uFE0F (emoji presentation indicator), annotating them as "non-fully-qualified";
// TR#51 says about non-fully-qualified *ZWJ sequences* that implementations may
// choose whether to support them for segmentation. This implementation will
// recognize /[0-9#*]\u20E3/ - i.e. without \uFE0F - as Emoji.
//
// See also: http://www.unicode.org/L2/L2016/16315-handling-seg-emoji.pdf
// https://docs.google.com/document/d/1yDZ5TUZNVVKaM9zYCCLbRIAKGNZANsAGl0bcNzGGvn8
//
// In particular, the above docs recommend a modified UAX#29 WB3c rule (covered by TR#51's "emoji_zwj_sequence"):
//
// WB3c ZWJ × (Extended_Pictographic | EmojiNRK)
//
{EmojiCharOrPresSeqOrModSeq} ( ( \p{WB:ZWJ} {EmojiCharOrPresSeqOrModSeq} )* | {TagSpec}+ {TagTerm} )
| {KeyCapBaseCharEx} {EmojiPresentationSelector}? {KeyCapEx}
| {RegionalIndicatorEx}{2}
{ yybegin(YYINITIAL); return EMOJI_TYPE; }
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLetQ) × Numeric
// WB12. Numeric × (MidNum | MidNumLetQ) Numeric
// WB13a. (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (AHLetter | Numeric | Katakana)
//
{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
{ yybegin(YYINITIAL); return NUMERIC_TYPE; }
{ yybegin(YYINITIAL); return NUMERIC_TYPE; }
// subset of the below for typing purposes only!
{HangulEx}+
@ -260,32 +369,32 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
{KatakanaEx}+
{ yybegin(YYINITIAL); return KATAKANA_TYPE; }
// UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
// WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
// WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
// WB7a. Hebrew_Letter × Single_Quote
// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
// WB9. (ALetter | Hebrew_Letter) × Numeric
// WB10. Numeric × (ALetter | Hebrew_Letter)
// WB13. Katakana × Katakana
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
// UAX#29 WB5. AHLetter × AHLetter
// WB6. AHLetter × (MidLetter | MidNumLetQ) AHLetter
// WB7. AHLetter (MidLetter | MidNumLetQ) × AHLetter
// WB7a. Hebrew_Letter × Single_Quote
// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
// WB9. AHLetter × Numeric
// WB10. Numeric × AHLetter
// WB13. Katakana × Katakana
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
//
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {AHLetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {AHLetterEx} )*
)+
)
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {AHLetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {AHLetterEx} )*
)+
)
)*
{ExtendNumLetEx}*
{ExtendNumLetEx}*
{ yybegin(YYINITIAL); return WORD_TYPE; }
@ -297,7 +406,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
// In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
// In Unicode 9.0, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC ( ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
@ -310,18 +419,15 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
//
{ComplexContextEx}+ { yybegin(YYINITIAL); return SOUTH_EAST_ASIAN_TYPE; }
// UAX#29 WB14. Any ÷ Any
// UAX#29 WB999. Any ÷ Any
//
{HanEx} { yybegin(YYINITIAL); return IDEOGRAPHIC_TYPE; }
{HiraganaEx} { yybegin(YYINITIAL); return HIRAGANA_TYPE; }
// UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB13c. Regional_Indicator × Regional_Indicator
// WB14. Any ÷ Any
// UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB999. Any ÷ Any
//
{RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
{ yybegin(YYINITIAL); /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
[^] { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, emoji or SE Asian -- ignore it. */ }
}

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.6.0 */
/* The following code was generated by JFlex 1.7.0 */
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -341,7 +341,7 @@ class WikipediaTokenizerImpl {
/* error messages for the codes above */
private static final String ZZ_ERROR_MSG[] = {
"Unkown internal scanner error",
"Unknown internal scanner error",
"Error: could not match input",
"Error: pushback value was too large"
};
@ -419,11 +419,11 @@ class WikipediaTokenizerImpl {
private int yycolumn;
/**
* zzAtBOL == true <=> the scanner is currently at the beginning of a line
* zzAtBOL == true iff the scanner is currently at the beginning of a line
*/
private boolean zzAtBOL = true;
/** zzAtEOF == true <=> the scanner is at the EOF */
/** zzAtEOF == true iff the scanner is at the EOF */
private boolean zzAtEOF;
/** denotes if the user-EOF-code has already been executed */
@ -575,28 +575,29 @@ final void reset() {
}
/* fill the buffer with new input */
int requested = zzBuffer.length - zzEndRead;
int totalRead = 0;
while (totalRead < requested) {
int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
if (numRead == -1) {
break;
}
totalRead += numRead;
}
int requested = zzBuffer.length - zzEndRead;
int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
if (totalRead > 0) {
zzEndRead += totalRead;
if (totalRead == requested) { /* possibly more input available */
/* not supposed to occur according to specification of java.io.Reader */
if (numRead == 0) {
throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
}
if (numRead > 0) {
zzEndRead += numRead;
/* If numRead == requested, we might have requested too few chars to
encode a full Unicode character. We assume that a Reader would
otherwise never return half characters. */
if (numRead == requested) {
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
--zzEndRead;
zzFinalHighSurrogate = 1;
}
}
/* potentially more input available */
return false;
}
// totalRead = 0: End of stream
/* numRead < 0 ==> end of stream */
return true;
}
@ -820,199 +821,245 @@ final void reset() {
// store back cached position
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 1:
{ numWikiTokensSeen = 0; positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
}
case 47: break;
case 2:
{ positionInc = 1; return ALPHANUM;
}
case 48: break;
case 3:
{ positionInc = 1; return CJ;
}
case 49: break;
case 4:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
}
case 50: break;
case 5:
{ positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
}
case 51: break;
case 6:
{ yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;
}
case 52: break;
case 7:
{ yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;
}
case 53: break;
case 8:
{ /* Break so we don't hit fall-through warning: */ break;/* ignore */
}
case 54: break;
case 9:
{ if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
}
case 55: break;
case 10:
{ numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
}
case 56: break;
case 11:
{ currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 57: break;
case 12:
{ currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/
}
case 58: break;
case 13:
{ currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 59: break;
case 14:
{ yybegin(STRING); numWikiTokensSeen++; return currentTokType;
}
case 60: break;
case 15:
{ currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING); /* Break so we don't hit fall-through warning: */ break;
}
case 61: break;
case 16:
{ currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;
}
case 62: break;
case 17:
{ yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;
}
case 63: break;
case 18:
{ /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */
}
case 64: break;
case 19:
{ yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/
}
case 65: break;
case 20:
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 66: break;
case 21:
{ yybegin(STRING); return currentTokType;/*pipe*/
}
case 67: break;
case 22:
{ numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}/* Break so we don't hit fall-through warning: */ break;
}
case 68: break;
case 23:
{ numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);/* Break so we don't hit fall-through warning: */ break;
}
case 69: break;
case 24:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
}
case 70: break;
case 25:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;
}
case 71: break;
case 26:
{ yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;
}
case 72: break;
case 27:
{ numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
}
case 73: break;
case 28:
{ currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 74: break;
case 29:
{ currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 75: break;
case 30:
{ yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
}
case 76: break;
case 31:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end italics*/
}
case 77: break;
case 32:
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 78: break;
case 33:
{ positionInc = 1; return APOSTROPHE;
}
case 79: break;
case 34:
{ positionInc = 1; return HOST;
}
case 80: break;
case 35:
{ positionInc = 1; return NUM;
}
case 81: break;
case 36:
{ positionInc = 1; return COMPANY;
}
case 82: break;
case 37:
{ currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 83: break;
case 38:
{ numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold*/
}
case 84: break;
case 39:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end sub header*/
}
case 85: break;
case 40:
{ positionInc = 1; return ACRONYM;
}
case 86: break;
case 41:
{ positionInc = 1; return EMAIL;
}
case 87: break;
case 42:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold italics*/
}
case 88: break;
case 43:
{ positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
}
case 89: break;
case 44:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);/* Break so we don't hit fall-through warning: */ break;
}
case 90: break;
case 45:
{ currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 91: break;
case 46:
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 92: break;
default:
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
return YYEOF;
}
else {
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
return YYEOF;
}
else {
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 1:
{ numWikiTokensSeen = 0; positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 47: break;
case 2:
{ positionInc = 1; return ALPHANUM;
}
// fall through
case 48: break;
case 3:
{ positionInc = 1; return CJ;
}
// fall through
case 49: break;
case 4:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 50: break;
case 5:
{ positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 51: break;
case 6:
{ yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;
}
// fall through
case 52: break;
case 7:
{ yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;
}
// fall through
case 53: break;
case 8:
{ /* Break so we don't hit fall-through warning: */ break;/* ignore */
}
// fall through
case 54: break;
case 9:
{ if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
}
// fall through
case 55: break;
case 10:
{ numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 56: break;
case 11:
{ currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 57: break;
case 12:
{ currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/
}
// fall through
case 58: break;
case 13:
{ currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 59: break;
case 14:
{ yybegin(STRING); numWikiTokensSeen++; return currentTokType;
}
// fall through
case 60: break;
case 15:
{ currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 61: break;
case 16:
{ currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;
}
// fall through
case 62: break;
case 17:
{ yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;
}
// fall through
case 63: break;
case 18:
{ /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */
}
// fall through
case 64: break;
case 19:
{ yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/
}
// fall through
case 65: break;
case 20:
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 66: break;
case 21:
{ yybegin(STRING); return currentTokType;/*pipe*/
}
// fall through
case 67: break;
case 22:
{ numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}/* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 68: break;
case 23:
{ numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);/* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 69: break;
case 24:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 70: break;
case 25:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 71: break;
case 26:
{ yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 72: break;
case 27:
{ numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 73: break;
case 28:
{ currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 74: break;
case 29:
{ currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 75: break;
case 30:
{ yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 76: break;
case 31:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end italics*/
}
// fall through
case 77: break;
case 32:
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 78: break;
case 33:
{ positionInc = 1; return APOSTROPHE;
}
// fall through
case 79: break;
case 34:
{ positionInc = 1; return HOST;
}
// fall through
case 80: break;
case 35:
{ positionInc = 1; return NUM;
}
// fall through
case 81: break;
case 36:
{ positionInc = 1; return COMPANY;
}
// fall through
case 82: break;
case 37:
{ currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 83: break;
case 38:
{ numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold*/
}
// fall through
case 84: break;
case 39:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end sub header*/
}
// fall through
case 85: break;
case 40:
{ positionInc = 1; return ACRONYM;
}
// fall through
case 86: break;
case 41:
{ positionInc = 1; return EMAIL;
}
// fall through
case 87: break;
case 42:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold italics*/
}
// fall through
case 88: break;
case 43:
{ positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
}
// fall through
case 89: break;
case 44:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);/* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 90: break;
case 45:
{ currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 91: break;
case 46:
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 92: break;
default:
zzScanError(ZZ_NO_MATCH);
}
}
}
}
}

View File

@ -499,7 +499,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
String randomHtmlishString2 // Don't create a comment (disallow "<!--") and don't include a closing ">"
= TestUtil.randomHtmlishString(random(), maxNumElems).replaceAll(">", " ").replaceFirst("^--","__");
String unclosedAngleBangNonCDATA = "<!" + randomHtmlishString1 +"-[CDATA[";
String unclosedAngleBangNonCDATA = "<!" + randomHtmlishString2 +"-[CDATA[";
String[] testGold = {
"one<![CDATA[<one><two>three<four></four></two></one>]]>two",

View File

@ -361,14 +361,14 @@ public class TestUAX29URLEmailAnalyzer extends BaseTokenStreamTestCase {
StringBuilder bToken = new StringBuilder();
// exact max length:
for(int i=0;i<StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
for(int i=0;i<UAX29URLEmailAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
bToken.append('b');
}
String bString = bToken.toString();
// first bString is exact max default length; next one is 1 too long
String input = "x " + bString + " " + bString + "b";
assertAnalyzesTo(a, input.toString(), new String[] {"x", bString, bString, "b"});
assertAnalyzesTo(a, input, new String[] {"x", bString, bString, "b"});
a.close();
}

View File

@ -467,7 +467,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
}
public void testUnicodeWordBreaks() throws Exception {
WordBreakTestUnicode_6_3_0 wordBreakTest = new WordBreakTestUnicode_6_3_0();
WordBreakTestUnicode_9_0_0 wordBreakTest = new WordBreakTestUnicode_9_0_0();
wordBreakTest.test(a);
}
@ -545,6 +545,80 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
}
/** simple emoji */
public void testEmoji() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩 💩💩",
new String[] { "💩", "💩", "💩" },
new String[] { "<EMOJI>", "<EMOJI>", "<EMOJI>" });
}
/** emoji zwj sequence */
public void testEmojiSequence() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "👩‍❤️‍👩",
new String[] { "👩‍❤️‍👩" },
new String[] { "<EMOJI>" });
}
/** emoji zwj sequence with fitzpatrick modifier */
public void testEmojiSequenceWithModifier() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "👨🏼‍⚕️",
new String[] { "👨🏼‍⚕️" },
new String[] { "<EMOJI>" });
}
/** regional indicator */
public void testEmojiRegionalIndicator() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "🇺🇸🇺🇸",
new String[] { "🇺🇸", "🇺🇸" },
new String[] { "<EMOJI>", "<EMOJI>" });
}
/** variation sequence */
public void testEmojiVariationSequence() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣",
new String[] { "#️⃣" },
new String[] { "<EMOJI>" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "3",
new String[] { "3",},
new String[] { "<EMOJI>" });
// text presentation sequences
BaseTokenStreamTestCase.assertAnalyzesTo(a, "#\uFE0E",
new String[] { },
new String[] { });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "3\uFE0E", // \uFE0E is included in \p{WB:Extend}
new String[] { "3\uFE0E",},
new String[] { "<NUM>" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E", // \u2B55 = HEAVY BLACK CIRCLE
new String[] { "\u2B55",},
new String[] { "<EMOJI>" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E\u200D\u2B55\uFE0E",
new String[] { "\u2B55", "\u200D\u2B55"},
new String[] { "<EMOJI>", "<EMOJI>" });
}
public void testEmojiTagSequence() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "🏴󠁧󠁢󠁥󠁮󠁧󠁿",
new String[] { "🏴󠁧󠁢󠁥󠁮󠁧󠁿" },
new String[] { "<EMOJI>" });
}
public void testEmojiTokenization() throws Exception {
// simple emoji around latin
BaseTokenStreamTestCase.assertAnalyzesTo(a, "poo💩poo",
new String[] { "poo", "💩", "poo" },
new String[] { "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>" });
// simple emoji around non-latin
BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩中國💩",
new String[] { "💩", "", "", "💩" },
new String[] { "<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>" });
}
public void testUnicodeEmojiTests() throws Exception {
EmojiTokenizationTestUnicode_11_0 emojiTest = new EmojiTokenizationTestUnicode_11_0();
emojiTest.test(a);
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);

View File

@ -2388,7 +2388,7 @@ ${ant.project.name}.test.dependencies=${test.classpath.list}
<!-- JFlex task -->
<target name="-install-jflex" unless="jflex.loaded" depends="ivy-availability-check,ivy-configure">
<ivy:cachepath organisation="de.jflex" module="jflex" revision="1.6.0"
<ivy:cachepath organisation="de.jflex" module="jflex" revision="1.7.0"
inline="true" conf="default" transitive="true" pathid="jflex.classpath"/>
<taskdef name="jflex" classname="jflex.anttask.JFlexTask" classpathref="jflex.classpath"/>
<property name="jflex.loaded" value="true"/>
@ -2645,7 +2645,11 @@ The following arguments can be provided to ant to alter its behaviour and target
<attribute name="dir"/>
<attribute name="name"/>
<sequential>
<jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on" inputstreamctor="false"/>
<!-- The default skeleton is specified here to work around a JFlex ant task bug: -->
<!-- invocations with a non-default skeleton will cause following invocations to -->
<!-- use the same skeleton, though not specified, unless the default is configured. -->
<jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on"
skeleton="${common.dir}/core/src/data/jflex/skeleton.default"/>
</sequential>
</macrodef>
@ -2653,20 +2657,13 @@ The following arguments can be provided to ant to alter its behaviour and target
<attribute name="dir"/>
<attribute name="name"/>
<sequential>
<jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on" inputstreamctor="false"/>
<!-- LUCENE-5897: Disallow scanner buffer expansion -->
<replaceregexp file="@{dir}/@{name}.java"
match="[ \t]*/\* is the buffer big enough\? \*/\s+if \(zzCurrentPos >= zzBuffer\.length.*?\}[ \t]*\r?\n"
replace="" flags="s" />
<jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on"
skeleton="${common.dir}/core/src/data/jflex/skeleton.disable.buffer.expansion.txt"/>
<!-- Since the ZZ_BUFFERSIZE declaration is generated rather than in the skeleton, we have to transform it here. -->
<replaceregexp file="@{dir}/@{name}.java"
match="private static final int ZZ_BUFFERSIZE ="
replace="private int ZZ_BUFFERSIZE ="/>
<replaceregexp file="@{dir}/@{name}.java"
match="int requested = zzBuffer.length - zzEndRead;"
replace="int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;"/>
<replaceregexp file="@{dir}/@{name}.java"
match="(zzFinalHighSurrogate = 1;)(\r?\n)"
replace="\1\2 if (totalRead == 1) { return true; }\2"/>
</sequential>
</macrodef>

View File

@ -0,0 +1,25 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// This file was automatically generated by getUnicodeEmojiProperties.pl
// from: http://unicode.org/Public/emoji/11.0/emoji-data.txt
Emoji = [\u{23}\u{2A}\u{30}-\u{39}\u{A9}\u{AE}\u{203C}\u{2049}\u{2122}\u{2139}\u{2194}-\u{2199}\u{21A9}-\u{21AA}\u{231A}-\u{231B}\u{2328}\u{23CF}\u{23E9}-\u{23F3}\u{23F8}-\u{23FA}\u{24C2}\u{25AA}-\u{25AB}\u{25B6}\u{25C0}\u{25FB}-\u{25FE}\u{2600}-\u{2604}\u{260E}\u{2611}\u{2614}-\u{2615}\u{2618}\u{261D}\u{2620}\u{2622}-\u{2623}\u{2626}\u{262A}\u{262E}-\u{262F}\u{2638}-\u{263A}\u{2640}\u{2642}\u{2648}-\u{2653}\u{265F}-\u{2660}\u{2663}\u{2665}-\u{2666}\u{2668}\u{267B}\u{267E}-\u{267F}\u{2692}-\u{2697}\u{2699}\u{269B}-\u{269C}\u{26A0}-\u{26A1}\u{26AA}-\u{26AB}\u{26B0}-\u{26B1}\u{26BD}-\u{26BE}\u{26C4}-\u{26C5}\u{26C8}\u{26CE}-\u{26CF}\u{26D1}\u{26D3}-\u{26D4}\u{26E9}-\u{26EA}\u{26F0}-\u{26F5}\u{26F7}-\u{26FA}\u{26FD}\u{2702}\u{2705}\u{2708}-\u{270D}\u{270F}\u{2712}\u{2714}\u{2716}\u{271D}\u{2721}\u{2728}\u{2733}-\u{2734}\u{2744}\u{2747}\u{274C}\u{274E}\u{2753}-\u{2755}\u{2757}\u{2763}-\u{2764}\u{2795}-\u{2797}\u{27A1}\u{27B0}\u{27BF}\u{2934}-\u{2935}\u{2B05}-\u{2B07}\u{2B1B}-\u{2B1C}\u{2B50}\u{2B55}\u{3030}\u{303D}\u{3297}\u{3299}\u{1F004}\u{1F0CF}\u{1F170}-\u{1F171}\u{1F17E}-\u{1F17F}\u{1F18E}\u{1F191}-\u{1F19A}\u{1F1E6}-\u{1F1FF}\u{1F201}-\u{1F202}\u{1F21A}\u{1F22F}\u{1F232}-\u{1F23A}\u{1F250}-\u{1F251}\u{1F300}-\u{1F321}\u{1F324}-\u{1F393}\u{1F396}-\u{1F397}\u{1F399}-\u{1F39B}\u{1F39E}-\u{1F3F0}\u{1F3F3}-\u{1F3F5}\u{1F3F7}-\u{1F4FD}\u{1F4FF}-\u{1F53D}\u{1F549}-\u{1F54E}\u{1F550}-\u{1F567}\u{1F56F}-\u{1F570}\u{1F573}-\u{1F57A}\u{1F587}\u{1F58A}-\u{1F58D}\u{1F590}\u{1F595}-\u{1F596}\u{1F5A4}-\u{1F5A5}\u{1F5A8}\u{1F5B1}-\u{1F5B2}\u{1F5BC}\u{1F5C2}-\u{1F5C4}\u{1F5D1}-\u{1F5D3}\u{1F5DC}-\u{1F5DE}\u{1F5E1}\u{1F5E3}\u{1F5E8}\u{1F5EF}\u{1F5F3}\u{1F5FA}-\u{1F64F}\u{1F680}-\u{1F6C5}\u{1F6CB}-\u{1F6D2}\u{1F6E0}-\u{1F6E5}\u{1F6E9}\u{1F6EB}-\u{1F6EC}\u{1F6F0}\u{1F6F3}-\u{1F6F9}\u{1F910}-\u{1F93A}\u{1F93C}-\u{1F93E}\u{1F940}-\u{1F945}\u{1F947}-\u{1F970}\u{1F973}-\u{1F976}\u{1F97A}\u{1F97C}-\u{1F9A2}\u{1F9B0}-\u{1F9B9}\u{1F9C0}-\u{1F9C2}\u{1F9D0}-\u{1F9FF}]
Emoji_Modifier = [\u{1F3FB}-\u{1F3FF}]
Emoji_Modifier_Base = [\u{261D}\u{26F9}\u{270A}-\u{270D}\u{1F385}\u{1F3C2}-\u{1F3C4}\u{1F3C7}\u{1F3CA}-\u{1F3CC}\u{1F442}-\u{1F443}\u{1F446}-\u{1F450}\u{1F466}-\u{1F469}\u{1F46E}\u{1F470}-\u{1F478}\u{1F47C}\u{1F481}-\u{1F483}\u{1F485}-\u{1F487}\u{1F4AA}\u{1F574}-\u{1F575}\u{1F57A}\u{1F590}\u{1F595}-\u{1F596}\u{1F645}-\u{1F647}\u{1F64B}-\u{1F64F}\u{1F6A3}\u{1F6B4}-\u{1F6B6}\u{1F6C0}\u{1F6CC}\u{1F918}-\u{1F91C}\u{1F91E}-\u{1F91F}\u{1F926}\u{1F930}-\u{1F939}\u{1F93D}-\u{1F93E}\u{1F9B5}-\u{1F9B6}\u{1F9B8}-\u{1F9B9}\u{1F9D1}-\u{1F9DD}]
Extended_Pictographic = [\u{A9}\u{AE}\u{203C}\u{2049}\u{2122}\u{2139}\u{2194}-\u{2199}\u{21A9}-\u{21AA}\u{231A}-\u{231B}\u{2328}\u{2388}\u{23CF}\u{23E9}-\u{23F3}\u{23F8}-\u{23FA}\u{24C2}\u{25AA}-\u{25AB}\u{25B6}\u{25C0}\u{25FB}-\u{25FE}\u{2600}-\u{2605}\u{2607}-\u{2612}\u{2614}-\u{2685}\u{2690}-\u{2705}\u{2708}-\u{2712}\u{2714}\u{2716}\u{271D}\u{2721}\u{2728}\u{2733}-\u{2734}\u{2744}\u{2747}\u{274C}\u{274E}\u{2753}-\u{2755}\u{2757}\u{2763}-\u{2767}\u{2795}-\u{2797}\u{27A1}\u{27B0}\u{27BF}\u{2934}-\u{2935}\u{2B05}-\u{2B07}\u{2B1B}-\u{2B1C}\u{2B50}\u{2B55}\u{3030}\u{303D}\u{3297}\u{3299}\u{1F000}-\u{1F0FF}\u{1F10D}-\u{1F10F}\u{1F12F}\u{1F16C}-\u{1F171}\u{1F17E}-\u{1F17F}\u{1F18E}\u{1F191}-\u{1F19A}\u{1F1AD}-\u{1F1E5}\u{1F201}-\u{1F20F}\u{1F21A}\u{1F22F}\u{1F232}-\u{1F23A}\u{1F23C}-\u{1F23F}\u{1F249}-\u{1F3FA}\u{1F400}-\u{1F53D}\u{1F546}-\u{1F64F}\u{1F680}-\u{1F6FF}\u{1F774}-\u{1F77F}\u{1F7D5}-\u{1F7FF}\u{1F80C}-\u{1F80F}\u{1F848}-\u{1F84F}\u{1F85A}-\u{1F85F}\u{1F888}-\u{1F88F}\u{1F8AE}-\u{1F8FF}\u{1F90C}-\u{1F93A}\u{1F93C}-\u{1F945}\u{1F947}-\u{1FFFD}]

View File

@ -0,0 +1,168 @@
#!/usr/bin/perl
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
use warnings;
use strict;
use File::Spec;
use Getopt::Long;
use LWP::UserAgent;
# Locate this script so the output file can be written alongside it, and so
# the generated header below can record which script produced the file.
my ($volume, $directory, $script_name) = File::Spec->splitpath($0);
# Require a version of the form X.Y (e.g. 9.0) via -v/--version; the extra
# hint line is printed only when a malformed version was actually supplied.
my $version = '';
unless (GetOptions("version=s" => \$version) && $version =~ /\d+\.\d+/) {
print STDERR "Usage: $script_name -v <version>\n";
print STDERR "\tversion must be of the form X.Y, e.g. 9.0\n"
if ($version);
exit 1;
}
# Unicode publishes one emoji-data.txt per emoji version; the URL is keyed
# by the requested version.
my $emoji_data_url = "http://unicode.org/Public/emoji/$version/emoji-data.txt";
my $output_filename = "UnicodeEmojiProperties.jflex";
# License header plus provenance comments for the generated file; note the
# heredoc interpolates $script_name and $emoji_data_url defined above.
my $header =<<"__HEADER__";
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// This file was automatically generated by ${script_name}
// from: ${emoji_data_url}
__HEADER__
# Maps property name -> arrayref of alternating (start, end) code points,
# filled in by parse_emoji_data_file for the four properties below.
my $property_ranges = {};
my $wanted_properties = { 'Emoji' => 1, 'Emoji_Modifier' => 1, 'Emoji_Modifier_Base' => 1, 'Extended_Pictographic' => 1 };
parse_emoji_data_file($emoji_data_url, $property_ranges, $wanted_properties);
# Write the JFlex include file next to this script.
my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
output_jflex_include_file($output_path, $property_ranges);
# sub parse_emoji_data_file
#
# Downloads and parses the emoji-data.txt file for the requested Unicode emoji
# version, extracting every code point range assigned to each of the wanted
# properties. Which ranges appear is determined entirely by the versioned data
# file itself; this sub performs no per-range age filtering.
#
# Parameters:
#
# - Emoji data file URL
# - Reference to hash of properties mapped to an array of alternating (start,end) code point ranges
# - Reference to hash of wanted property names
#
# Download the emoji data file at $url and record, for each property named in
# %$wanted_props, its code point ranges in $prop_ranges->{$property} as a flat
# list of alternating (start, end) integers, coalescing contiguous or
# overlapping ranges as they are encountered.
sub parse_emoji_data_file {
my ($url, $prop_ranges, $wanted_props) = @_;
my $content = get_URL_content($url);
print STDERR "Parsing '$url'...";
## Data lines look like one of:
##   231A..231B    ; Emoji_Presentation   #  1.1  [2] (⌚..⌛)    watch..hourglass done
##   1F9C0         ; Emoji_Presentation   #  8.0  [1] (🧀)       cheese wedge
##   1FA00..1FA5F  ; Extended_Pictographic#  NA  [96] (🨀️..🩟️)   <reserved-1FA00>..<reserved-1FA5F>
for my $line (split /\r?\n/, $content) {
if (my ($first, $last, $prop) = $line =~ /^([0-9A-F]{4,5})(?:\.\.([0-9A-F]{4,5}))?\s*;\s*([^\s#]+)/) {
next unless defined($wanted_props->{$prop});  # only collect ranges for wanted properties
$prop_ranges->{$prop} = [] unless defined($prop_ranges->{$prop});
$last = $first unless defined($last);         # single code point => degenerate range
my $begin = hex $first;
my $finish = hex $last;
my $ranges = $prop_ranges->{$prop};
if (@$ranges == 0 || $begin > $ranges->[-1] + 1) {
# Disjoint from the previous range for this property: start a new one.
push @$ranges, $begin, $finish;
} else {
# Contiguous or overlapping: extend the previous range's end point.
$ranges->[-1] = $finish;
}
}
# Lines that don't match (comments, blanks, headers) carry no data; skip.
}
print STDERR "done.\n";
}
# sub get_URL_content
#
# Retrieves and returns the content of the given URL.
#
# Parameter:
#
# - URL to get content for
#
# Fetch $url over HTTP and return the raw response body. On any non-success
# status the failure is reported on STDERR and the whole script exits.
sub get_URL_content {
my ($url) = @_;
print STDERR "Retrieving '$url'...";
# LWP::UserAgent->get() builds and dispatches the GET request in one step.
my $response = LWP::UserAgent->new->get($url);
if (!$response->is_success) {
print STDERR "Failed to download '$url':\n\t", $response->status_line, "\n";
exit 1;
}
print STDERR "done.\n";
return $response->content;
}
# sub output_jflex_include_file
#
# Parameters:
#
# - Output path
# - Reference to hash mapping properties to an array of alternating (start,end) codepoint ranges
#
sub output_jflex_include_file {
# Writes the generated JFlex include file: the license/provenance header
# (file-scoped $header) followed by one JFlex character-class definition
# per property, with properties in sorted order for stable output.
my ($path, $prop_ranges) = @_;
# Three-arg open with a lexical handle. The original two-arg form
# 'open OUT, ">$path" || die ...' never reported failures: '||' bound to
# the (always-true) filename string instead of to open().
open my $out, '>', $path
or die "Error opening '$path' for writing: $!";
print STDERR "Writing '$path'...";
print {$out} $header;
for my $prop (sort keys %$prop_ranges) {
my $ranges = $prop_ranges->{$prop};
print {$out} "$prop = [";
# @$ranges holds alternating (start, end) code points; emit "\u{START}"
# and append "-\u{END}" only when the range spans more than one code point.
for (my $index = 0 ; $index < scalar(@$ranges) ; $index += 2) {
printf {$out} "\\u{%X}", $ranges->[$index];
printf {$out} "-\\u{%X}", $ranges->[$index + 1] if ($ranges->[$index + 1] > $ranges->[$index]);
}
print {$out} "]\n";
}
print {$out} "\n";
# Buffered write errors surface at close; fail loudly rather than leave a
# silently truncated include file behind.
close $out
or die "Error closing '$path' after writing: $!";
print STDERR "done.\n";
}

View File

@ -0,0 +1,342 @@
/** This character denotes the end of file */
public static final int YYEOF = -1;
/** initial size of the lookahead buffer */
--- private static final int ZZ_BUFFERSIZE = ...;
/** lexical states */
--- lexical states, charmap
/* error codes */
private static final int ZZ_UNKNOWN_ERROR = 0;
private static final int ZZ_NO_MATCH = 1;
private static final int ZZ_PUSHBACK_2BIG = 2;
/* error messages for the codes above */
private static final String ZZ_ERROR_MSG[] = {
"Unknown internal scanner error",
"Error: could not match input",
"Error: pushback value was too large"
};
--- isFinal list
/** the input device */
private java.io.Reader zzReader;
/** the current state of the DFA */
private int zzState;
/** the current lexical state */
private int zzLexicalState = YYINITIAL;
/** this buffer contains the current text to be matched and is
the source of the yytext() string */
private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
/** the textposition at the last accepting state */
private int zzMarkedPos;
/** the current text position in the buffer */
private int zzCurrentPos;
/** startRead marks the beginning of the yytext() string in the buffer */
private int zzStartRead;
/** endRead marks the last character in the buffer, that has been read
from input */
private int zzEndRead;
/** number of newlines encountered up to the start of the matched text */
private int yyline;
/** the number of characters up to the start of the matched text */
private int yychar;
/**
* the number of characters from the last newline up to the start of the
* matched text
*/
private int yycolumn;
/**
* zzAtBOL == true iff the scanner is currently at the beginning of a line
*/
private boolean zzAtBOL = true;
/** zzAtEOF == true iff the scanner is at the EOF */
private boolean zzAtEOF;
/** denotes if the user-EOF-code has already been executed */
private boolean zzEOFDone;
/**
* The number of occupied positions in zzBuffer beyond zzEndRead.
* When a lead/high surrogate has been read from the input stream
* into the final zzBuffer position, this will have a value of 1;
* otherwise, it will have a value of 0.
*/
private int zzFinalHighSurrogate = 0;
--- user class code
--- constructor declaration
/**
* Refills the input buffer.
*
* @return <code>false</code>, iff there was new input.
*
* @exception java.io.IOException if any I/O-Error occurs
*/
private boolean zzRefill() throws java.io.IOException {
/* first: make room (if you can) */
if (zzStartRead > 0) {
zzEndRead += zzFinalHighSurrogate;
zzFinalHighSurrogate = 0;
System.arraycopy(zzBuffer, zzStartRead,
zzBuffer, 0,
zzEndRead-zzStartRead);
/* translate stored positions */
zzEndRead-= zzStartRead;
zzCurrentPos-= zzStartRead;
zzMarkedPos-= zzStartRead;
zzStartRead = 0;
}
/* is the buffer big enough? */
if (zzCurrentPos >= zzBuffer.length - zzFinalHighSurrogate) {
/* if not: blow it up */
char newBuffer[] = new char[zzBuffer.length*2];
System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
zzBuffer = newBuffer;
zzEndRead += zzFinalHighSurrogate;
zzFinalHighSurrogate = 0;
}
/* fill the buffer with new input */
int requested = zzBuffer.length - zzEndRead;
int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
/* not supposed to occur according to specification of java.io.Reader */
if (numRead == 0) {
throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
}
if (numRead > 0) {
zzEndRead += numRead;
/* If numRead == requested, we might have requested to few chars to
encode a full Unicode character. We assume that a Reader would
otherwise never return half characters. */
if (numRead == requested) {
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
--zzEndRead;
zzFinalHighSurrogate = 1;
}
}
/* potentially more input available */
return false;
}
/* numRead < 0 ==> end of stream */
return true;
}
/**
* Closes the input stream.
*/
public final void yyclose() throws java.io.IOException {
zzAtEOF = true; /* indicate end of file */
zzEndRead = zzStartRead; /* invalidate buffer */
if (zzReader != null)
zzReader.close();
}
/**
* Resets the scanner to read from a new input stream.
* Does not close the old reader.
*
* All internal variables are reset, the old input stream
* <b>cannot</b> be reused (internal buffer is discarded and lost).
* Lexical state is set to <tt>ZZ_INITIAL</tt>.
*
* Internal scan buffer is resized down to its initial length, if it has grown.
*
* @param reader the new input stream
*/
public final void yyreset(java.io.Reader reader) {
zzReader = reader;
zzAtBOL = true;
zzAtEOF = false;
zzEOFDone = false;
zzEndRead = zzStartRead = 0;
zzCurrentPos = zzMarkedPos = 0;
zzFinalHighSurrogate = 0;
yyline = yychar = yycolumn = 0;
zzLexicalState = YYINITIAL;
if (zzBuffer.length > ZZ_BUFFERSIZE)
zzBuffer = new char[ZZ_BUFFERSIZE];
}
/**
* Returns the current lexical state.
*/
public final int yystate() {
return zzLexicalState;
}
/**
* Enters a new lexical state
*
* @param newState the new lexical state
*/
public final void yybegin(int newState) {
zzLexicalState = newState;
}
/**
* Returns the text matched by the current regular expression.
*/
public final String yytext() {
return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
}
/**
* Returns the character at position <tt>pos</tt> from the
* matched text.
*
* It is equivalent to yytext().charAt(pos), but faster
*
* @param pos the position of the character to fetch.
* A value from 0 to yylength()-1.
*
* @return the character at position pos
*/
public final char yycharat(int pos) {
return zzBuffer[zzStartRead+pos];
}
/**
* Returns the length of the matched text region.
*/
public final int yylength() {
return zzMarkedPos-zzStartRead;
}
/**
* Reports an error that occured while scanning.
*
* In a wellformed scanner (no or only correct usage of
* yypushback(int) and a match-all fallback rule) this method
* will only be called with things that "Can't Possibly Happen".
* If this method is called, something is seriously wrong
* (e.g. a JFlex bug producing a faulty scanner etc.).
*
* Usual syntax/scanner level error handling should be done
* in error fallback rules.
*
* @param errorCode the code of the errormessage to display
*/
--- zzScanError declaration
String message;
try {
message = ZZ_ERROR_MSG[errorCode];
}
catch (ArrayIndexOutOfBoundsException e) {
message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
}
--- throws clause
}
/**
* Pushes the specified amount of characters back into the input stream.
*
* They will be read again by then next call of the scanning method
*
* @param number the number of characters to be read again.
* This number must not be greater than yylength()!
*/
--- yypushback decl (contains zzScanError exception)
if ( number > yylength() )
zzScanError(ZZ_PUSHBACK_2BIG);
zzMarkedPos -= number;
}
--- zzDoEOF
/**
* Resumes scanning until the next regular expression is matched,
* the end of input is encountered or an I/O-Error occurs.
*
* @return the next token
* @exception java.io.IOException if any I/O-Error occurs
*/
--- yylex declaration
int zzInput;
int zzAction;
// cached fields:
int zzCurrentPosL;
int zzMarkedPosL;
int zzEndReadL = zzEndRead;
char [] zzBufferL = zzBuffer;
char [] zzCMapL = ZZ_CMAP;
--- local declarations
while (true) {
zzMarkedPosL = zzMarkedPos;
--- start admin (line, char, col count)
zzAction = -1;
zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
--- start admin (lexstate etc)
zzForAction: {
while (true) {
--- next input, line, col, char count, next transition, isFinal action
zzAction = zzState;
zzMarkedPosL = zzCurrentPosL;
--- line count update
}
}
}
// store back cached position
zzMarkedPos = zzMarkedPosL;
--- char count update
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
--- eofvalue
}
else {
--- actions
default:
--- no match
}
}
}
}
--- main
}

View File

@ -0,0 +1,348 @@
/** This character denotes the end of file */
public static final int YYEOF = -1;
/** initial size of the lookahead buffer */
--- private static final int ZZ_BUFFERSIZE = ...;
/** lexical states */
--- lexical states, charmap
/* error codes */
private static final int ZZ_UNKNOWN_ERROR = 0;
private static final int ZZ_NO_MATCH = 1;
private static final int ZZ_PUSHBACK_2BIG = 2;
/* error messages for the codes above */
private static final String ZZ_ERROR_MSG[] = {
"Unknown internal scanner error",
"Error: could not match input",
"Error: pushback value was too large"
};
--- isFinal list
/** the input device */
private java.io.Reader zzReader;
/** the current state of the DFA */
private int zzState;
/** the current lexical state */
private int zzLexicalState = YYINITIAL;
/** this buffer contains the current text to be matched and is
the source of the yytext() string */
private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
/** the text position at the last accepting state */
private int zzMarkedPos;
/** the current text position in the buffer */
private int zzCurrentPos;
/** startRead marks the beginning of the yytext() string in the buffer */
private int zzStartRead;
/** endRead marks the last character in the buffer that has been read
from input */
private int zzEndRead;
/** number of newlines encountered up to the start of the matched text */
private int yyline;
/** the number of characters up to the start of the matched text */
private int yychar;
/**
 * the number of characters from the last newline up to the start of the
 * matched text
 */
private int yycolumn;
/**
 * zzAtBOL == true iff the scanner is currently at the beginning of a line
 */
private boolean zzAtBOL = true;
/** zzAtEOF == true iff the scanner is at the EOF */
private boolean zzAtEOF;
/** denotes if the user-EOF-code has already been executed */
private boolean zzEOFDone;
/**
 * The number of occupied positions in zzBuffer beyond zzEndRead.
 * When a lead/high surrogate has been read from the input stream
 * into the final zzBuffer position, this will have a value of 1;
 * otherwise, it will have a value of 0.
 */
private int zzFinalHighSurrogate = 0;
--- user class code
--- constructor declaration
/* -------------------------------------------------------------------------------- */
/* Begin Lucene-specific disable-buffer-expansion modifications to skeleton.default */
/**
 * Refills the input buffer without ever growing it (Lucene-specific:
 * buffer expansion is disabled, so when the buffer is full this method
 * simply reports that no new input could be read).
 *
 * @return <code>false</code>, iff there was new input.
 *
 * @exception java.io.IOException if any I/O-Error occurs
 */
private boolean zzRefill() throws java.io.IOException {
/* first: make room (if you can) */
if (zzStartRead > 0) {
// Re-attach a high surrogate that a previous refill hid beyond zzEndRead,
// so the compaction below moves and counts it with the rest of the text.
zzEndRead += zzFinalHighSurrogate;
zzFinalHighSurrogate = 0;
System.arraycopy(zzBuffer, zzStartRead,
zzBuffer, 0,
zzEndRead-zzStartRead);
/* translate stored positions */
zzEndRead-= zzStartRead;
zzCurrentPos-= zzStartRead;
zzMarkedPos-= zzStartRead;
zzStartRead = 0;
}
/* fill the buffer with new input */
int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;
if (requested == 0) {
// Buffer is completely full and may not be expanded: no new input.
return true;
}
int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
/* not supposed to occur according to specification of java.io.Reader */
if (numRead == 0) {
throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
}
if (numRead > 0) {
zzEndRead += numRead;
// Never let the filled region end between a surrogate pair: either hide
// the trailing high surrogate until the next refill, or, when there is
// still room, read the expected low surrogate immediately.
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
if (numRead == requested) { // We might have requested too few chars to encode a full Unicode character.
--zzEndRead;
zzFinalHighSurrogate = 1;
if (numRead == 1) {
// The only char read was the hidden surrogate: effectively no new input.
return true;
}
} else { // There is room in the buffer for at least one more char
int c = zzReader.read(); // Expecting to read a low surrogate char
if (c == -1) {
return true;
} else {
zzBuffer[zzEndRead++] = (char)c;
return false;
}
}
}
/* potentially more input available */
return false;
}
/* numRead < 0 ==> end of stream */
return true;
}
/* End Lucene-specific disable-buffer-expansion modifications to skeleton.default */
/* ------------------------------------------------------------------------------ */
/**
 * Closes the input stream and puts the scanner into a terminal state.
 */
public final void yyclose() throws java.io.IOException {
  zzAtEOF = true;               // signal end of file to the scanning loop
  zzEndRead = zzStartRead;      // drop buffered text: nothing left to match
  if (zzReader != null) {
    zzReader.close();
  }
}
/**
 * Resets the scanner to read from a new input stream.
 * Does not close the old reader.
 *
 * All internal variables are reset, the old input stream
 * <b>cannot</b> be reused (internal buffer is discarded and lost).
 * Lexical state is set to <tt>ZZ_INITIAL</tt>.
 *
 * Internal scan buffer is resized down to its initial length, if it has grown.
 *
 * @param reader the new input stream
 */
public final void yyreset(java.io.Reader reader) {
  zzReader = reader;

  // Flags back to their start-of-input values.
  zzAtBOL = true;
  zzAtEOF = false;
  zzEOFDone = false;

  // Empty the buffer and forget any held-back high surrogate.
  zzEndRead = zzStartRead = 0;
  zzCurrentPos = zzMarkedPos = 0;
  zzFinalHighSurrogate = 0;

  // Restart the position counters and the lexical state.
  yyline = yychar = yycolumn = 0;
  zzLexicalState = YYINITIAL;

  // Shrink the buffer if it has grown beyond its initial size.
  if (zzBuffer.length > ZZ_BUFFERSIZE) {
    zzBuffer = new char[ZZ_BUFFERSIZE];
  }
}
/**
 * Returns the current lexical state.
 *
 * @return the state last set via yybegin(int); initially YYINITIAL
 */
public final int yystate() {
return zzLexicalState;
}
/**
 * Enters a new lexical state.
 *
 * @param newState the new lexical state; reported afterwards by yystate()
 */
public final void yybegin(int newState) {
zzLexicalState = newState;
}
/**
 * Returns the text matched by the current regular expression
 * as a freshly allocated String.
 */
public final String yytext() {
  final int matchStart = zzStartRead;
  final int matchLength = zzMarkedPos - matchStart;
  return new String(zzBuffer, matchStart, matchLength);
}
/**
 * Returns the character at position <tt>pos</tt> from the
 * matched text.
 *
 * It is equivalent to yytext().charAt(pos), but faster.
 *
 * NOTE: pos is not range-checked against the match length here; the
 * caller is responsible for passing a value from 0 to yylength()-1.
 *
 * @param pos the position of the character to fetch.
 * A value from 0 to yylength()-1.
 *
 * @return the character at position pos
 */
public final char yycharat(int pos) {
return zzBuffer[zzStartRead+pos];
}
/**
 * Returns the length of the matched text region, in chars
 * (the distance from the match start to the match end).
 */
public final int yylength() {
return zzMarkedPos-zzStartRead;
}
/**
 * Reports an error that occurred while scanning.
 *
 * In a well-formed scanner (no or only correct usage of
 * yypushback(int) and a match-all fallback rule) this method
 * will only be called with things that "Can't Possibly Happen".
 * If this method is called, something is seriously wrong
 * (e.g. a JFlex bug producing a faulty scanner etc.).
 *
 * Usual syntax/scanner level error handling should be done
 * in error fallback rules.
 *
 * @param errorCode the code of the error message to display
 */
--- zzScanError declaration
String message;
try {
message = ZZ_ERROR_MSG[errorCode];
}
catch (ArrayIndexOutOfBoundsException e) {
// Unrecognized code: fall back to the generic error message.
message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
}
--- throws clause
}
/**
 * Pushes the specified amount of characters back into the input stream.
 *
 * They will be read again by the next call of the scanning method.
 *
 * @param number the number of characters to be read again.
 * This number must not be greater than yylength()!
 */
--- yypushback decl (contains zzScanError exception)
if ( number > yylength() )
zzScanError(ZZ_PUSHBACK_2BIG);
// Rewind the match end; scanning resumes from the new position.
zzMarkedPos -= number;
}
--- zzDoEOF
/**
 * Resumes scanning until the next regular expression is matched,
 * the end of input is encountered or an I/O-Error occurs.
 *
 * @return the next token
 * @exception java.io.IOException if any I/O-Error occurs
 */
--- yylex declaration
int zzInput;
int zzAction;
// cached fields:
// (local copies of hot fields; written back below so the scanning loop
// avoids repeated field access)
int zzCurrentPosL;
int zzMarkedPosL;
int zzEndReadL = zzEndRead;
char [] zzBufferL = zzBuffer;
char [] zzCMapL = ZZ_CMAP;
--- local declarations
while (true) {
zzMarkedPosL = zzMarkedPos;
--- start admin (line, char, col count)
zzAction = -1;
zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
--- start admin (lexstate etc)
zzForAction: {
while (true) {
--- next input, line, col, char count, next transition, isFinal action
zzAction = zzState;
zzMarkedPosL = zzCurrentPosL;
--- line count update
}
}
}
// store back cached position
zzMarkedPos = zzMarkedPosL;
--- char count update
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
--- eofvalue
}
else {
--- actions
default:
--- no match
}
}
}
}
--- main
}

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.6.0 */
/* The following code was generated by JFlex 1.7.0 */
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -36,6 +36,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
* <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
* <li>&lt;KATAKANA&gt;: A sequence of katakana characters</li>
* <li>&lt;HANGUL&gt;: A sequence of Hangul characters</li>
* <li>&lt;EMOJI&gt;: A sequence of Emoji characters</li>
* </ul>
*/
@SuppressWarnings("fallthrough")
@ -65,147 +66,212 @@ public final class StandardTokenizerImpl {
* Translates characters to character classes
*/
private static final String ZZ_CMAP_PACKED =
"\42\0\1\15\4\0\1\14\4\0\1\7\1\0\1\10\1\0\12\4"+
"\1\6\1\7\5\0\32\1\4\0\1\11\1\0\32\1\57\0\1\1"+
"\2\0\1\3\7\0\1\1\1\0\1\6\2\0\1\1\5\0\27\1"+
"\1\0\37\1\1\0\u01ca\1\4\0\14\1\5\0\1\6\10\0\5\1"+
"\7\0\1\1\1\0\1\1\21\0\160\3\5\1\1\0\2\1\2\0"+
"\4\1\1\7\7\0\1\1\1\6\3\1\1\0\1\1\1\0\24\1"+
"\1\0\123\1\1\0\213\1\1\0\7\3\236\1\11\0\46\1\2\0"+
"\1\1\7\0\47\1\1\0\1\7\7\0\55\3\1\0\1\3\1\0"+
"\2\3\1\0\2\3\1\0\1\3\10\0\33\16\5\0\3\16\1\1"+
"\1\6\13\0\5\3\7\0\2\7\2\0\13\3\1\0\1\3\3\0"+
"\53\1\25\3\12\4\1\0\1\4\1\7\1\0\2\1\1\3\143\1"+
"\1\0\1\1\10\3\1\0\6\3\2\1\2\3\1\0\4\3\2\1"+
"\12\4\3\1\2\0\1\1\17\0\1\3\1\1\1\3\36\1\33\3"+
"\2\0\131\1\13\3\1\1\16\0\12\4\41\1\11\3\2\1\2\0"+
"\1\7\1\0\1\1\5\0\26\1\4\3\1\1\11\3\1\1\3\3"+
"\1\1\5\3\22\0\31\1\3\3\104\0\1\1\1\0\13\1\67\0"+
"\33\3\1\0\4\3\66\1\3\3\1\1\22\3\1\1\7\3\12\1"+
"\2\3\2\0\12\4\1\0\7\1\1\0\7\1\1\0\3\3\1\0"+
"\10\1\2\0\2\1\2\0\26\1\1\0\7\1\1\0\1\1\3\0"+
"\4\1\2\0\1\3\1\1\7\3\2\0\2\3\2\0\3\3\1\1"+
"\10\0\1\3\4\0\2\1\1\0\3\1\2\3\2\0\12\4\2\1"+
"\17\0\3\3\1\0\6\1\4\0\2\1\2\0\26\1\1\0\7\1"+
"\1\0\2\1\1\0\2\1\1\0\2\1\2\0\1\3\1\0\5\3"+
"\4\0\2\3\2\0\3\3\3\0\1\3\7\0\4\1\1\0\1\1"+
"\7\0\12\4\2\3\3\1\1\3\13\0\3\3\1\0\11\1\1\0"+
"\3\1\1\0\26\1\1\0\7\1\1\0\2\1\1\0\5\1\2\0"+
"\1\3\1\1\10\3\1\0\3\3\1\0\3\3\2\0\1\1\17\0"+
"\2\1\2\3\2\0\12\4\21\0\3\3\1\0\10\1\2\0\2\1"+
"\2\0\26\1\1\0\7\1\1\0\2\1\1\0\5\1\2\0\1\3"+
"\1\1\7\3\2\0\2\3\2\0\3\3\10\0\2\3\4\0\2\1"+
"\1\0\3\1\2\3\2\0\12\4\1\0\1\1\20\0\1\3\1\1"+
"\1\0\6\1\3\0\3\1\1\0\4\1\3\0\2\1\1\0\1\1"+
"\1\0\2\1\3\0\2\1\3\0\3\1\3\0\14\1\4\0\5\3"+
"\3\0\3\3\1\0\4\3\2\0\1\1\6\0\1\3\16\0\12\4"+
"\21\0\3\3\1\0\10\1\1\0\3\1\1\0\27\1\1\0\12\1"+
"\1\0\5\1\3\0\1\1\7\3\1\0\3\3\1\0\4\3\7\0"+
"\2\3\1\0\2\1\6\0\2\1\2\3\2\0\12\4\22\0\2\3"+
"\1\0\10\1\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1"+
"\2\0\1\3\1\1\7\3\1\0\3\3\1\0\4\3\7\0\2\3"+
"\7\0\1\1\1\0\2\1\2\3\2\0\12\4\1\0\2\1\17\0"+
"\2\3\1\0\10\1\1\0\3\1\1\0\51\1\2\0\1\1\7\3"+
"\1\0\3\3\1\0\4\3\1\1\10\0\1\3\10\0\2\1\2\3"+
"\2\0\12\4\12\0\6\1\2\0\2\3\1\0\22\1\3\0\30\1"+
"\1\0\11\1\1\0\1\1\2\0\7\1\3\0\1\3\4\0\6\3"+
"\1\0\1\3\1\0\10\3\22\0\2\3\15\0\60\20\1\21\2\20"+
"\7\21\5\0\7\20\10\21\1\0\12\4\47\0\2\20\1\0\1\20"+
"\2\0\2\20\1\0\1\20\2\0\1\20\6\0\4\20\1\0\7\20"+
"\1\0\3\20\1\0\1\20\1\0\1\20\2\0\2\20\1\0\4\20"+
"\1\21\2\20\6\21\1\0\2\21\1\20\2\0\5\20\1\0\1\20"+
"\1\0\6\21\2\0\12\4\2\0\4\20\40\0\1\1\27\0\2\3"+
"\6\0\12\4\13\0\1\3\1\0\1\3\1\0\1\3\4\0\2\3"+
"\10\1\1\0\44\1\4\0\24\3\1\0\2\3\5\1\13\3\1\0"+
"\44\3\11\0\1\3\71\0\53\20\24\21\1\20\12\4\6\0\6\20"+
"\4\21\4\20\3\21\1\20\3\21\2\20\7\21\3\20\4\21\15\20"+
"\14\21\1\20\1\21\12\4\4\21\2\20\46\1\1\0\1\1\5\0"+
"\1\1\2\0\53\1\1\0\4\1\u0100\2\111\1\1\0\4\1\2\0"+
"\7\1\1\0\1\1\1\0\4\1\2\0\51\1\1\0\4\1\2\0"+
"\41\1\1\0\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0"+
"\17\1\1\0\71\1\1\0\4\1\2\0\103\1\2\0\3\3\40\0"+
"\20\1\20\0\125\1\14\0\u026c\1\2\0\21\1\1\0\32\1\5\0"+
"\113\1\3\0\3\1\17\0\15\1\1\0\4\1\3\3\13\0\22\1"+
"\3\3\13\0\22\1\2\3\14\0\15\1\1\0\3\1\1\0\2\3"+
"\14\0\64\20\40\21\3\0\1\20\4\0\1\20\1\21\2\0\12\4"+
"\41\0\4\3\1\0\12\4\6\0\130\1\10\0\51\1\1\3\1\1"+
"\5\0\106\1\12\0\35\1\3\0\14\3\4\0\14\3\12\0\12\4"+
"\36\20\2\0\5\20\13\0\54\20\4\0\21\21\7\20\2\21\6\0"+
"\12\4\1\20\3\0\2\20\40\0\27\1\5\3\4\0\65\20\12\21"+
"\1\0\35\21\2\0\1\3\12\4\6\0\12\4\6\0\16\20\122\0"+
"\5\3\57\1\21\3\7\1\4\0\12\4\21\0\11\3\14\0\3\3"+
"\36\1\15\3\2\1\12\4\54\1\16\3\14\0\44\1\24\3\10\0"+
"\12\4\3\0\3\1\12\4\44\1\122\0\3\3\1\0\25\3\4\1"+
"\1\3\4\1\3\3\2\1\11\0\300\1\47\3\25\0\4\3\u0116\1"+
"\2\0\6\1\2\0\46\1\2\0\6\1\2\0\10\1\1\0\1\1"+
"\1\0\1\1\1\0\1\1\1\0\37\1\2\0\65\1\1\0\7\1"+
"\1\0\1\1\3\0\3\1\1\0\7\1\3\0\4\1\2\0\6\1"+
"\4\0\15\1\5\0\3\1\1\0\7\1\17\0\4\3\10\0\2\10"+
"\12\0\1\10\2\0\1\6\2\0\5\3\20\0\2\11\3\0\1\7"+
"\17\0\1\11\13\0\5\3\1\0\12\3\1\0\1\1\15\0\1\1"+
"\20\0\15\1\63\0\41\3\21\0\1\1\4\0\1\1\2\0\12\1"+
"\1\0\1\1\3\0\5\1\6\0\1\1\1\0\1\1\1\0\1\1"+
"\1\0\4\1\1\0\13\1\2\0\4\1\5\0\5\1\4\0\1\1"+
"\21\0\51\1\u032d\0\64\1\u0716\0\57\1\1\0\57\1\1\0\205\1"+
"\6\0\4\1\3\3\2\1\14\0\46\1\1\0\1\1\5\0\1\1"+
"\2\0\70\1\7\0\1\1\17\0\1\3\27\1\11\0\7\1\1\0"+
"\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0"+
"\7\1\1\0\7\1\1\0\40\3\57\0\1\1\120\0\32\12\1\0"+
"\131\12\14\0\326\12\57\0\1\1\1\0\1\12\31\0\11\12\6\3"+
"\1\0\5\5\2\0\3\12\1\1\1\1\4\0\126\13\2\0\2\3"+
"\2\5\3\13\133\5\1\0\4\5\5\0\51\1\3\0\136\2\21\0"+
"\33\1\65\0\20\5\320\0\57\5\1\0\130\5\250\0\u19b6\12\112\0"+
"\u51cd\12\63\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\4"+
"\2\1\24\0\57\1\4\3\1\0\12\3\1\0\31\1\7\0\1\3"+
"\120\1\2\3\45\0\11\1\2\0\147\1\2\0\4\1\1\0\4\1"+
"\14\0\13\1\115\0\12\1\1\3\3\1\1\3\4\1\1\3\27\1"+
"\5\3\30\0\64\1\14\0\2\3\62\1\21\3\13\0\12\4\6\0"+
"\22\3\6\1\3\0\1\1\4\0\12\4\34\1\10\3\2\0\27\1"+
"\15\3\14\0\35\2\3\0\4\3\57\1\16\3\16\0\1\1\12\4"+
"\46\0\51\1\16\3\11\0\3\1\1\3\10\1\2\3\2\0\12\4"+
"\6\0\33\20\1\21\4\0\60\20\1\21\1\20\3\21\2\20\2\21"+
"\5\20\2\21\1\20\1\21\1\20\30\0\5\20\13\1\5\3\2\0"+
"\3\1\2\3\12\0\6\1\2\0\6\1\2\0\6\1\11\0\7\1"+
"\1\0\7\1\221\0\43\1\10\3\1\0\2\3\2\0\12\4\6\0"+
"\u2ba4\2\14\0\27\2\4\0\61\2\u2104\0\u016e\12\2\0\152\12\46\0"+
"\7\1\14\0\5\1\5\0\1\16\1\3\12\16\1\0\15\16\1\0"+
"\5\16\1\0\1\16\1\0\2\16\1\0\2\16\1\0\12\16\142\1"+
"\41\0\u016b\1\22\0\100\1\2\0\66\1\50\0\14\1\4\0\20\3"+
"\1\7\2\0\1\6\1\7\13\0\7\3\14\0\2\11\30\0\3\11"+
"\1\7\1\0\1\10\1\0\1\7\1\6\32\0\5\1\1\0\207\1"+
"\2\0\1\3\7\0\1\10\4\0\1\7\1\0\1\10\1\0\12\4"+
"\1\6\1\7\5\0\32\1\4\0\1\11\1\0\32\1\13\0\70\5"+
"\2\3\37\2\3\0\6\2\2\0\6\2\2\0\6\2\2\0\3\2"+
"\34\0\3\3\4\0\14\1\1\0\32\1\1\0\23\1\1\0\2\1"+
"\1\0\17\1\2\0\16\1\42\0\173\1\105\0\65\1\210\0\1\3"+
"\202\0\35\1\3\0\61\1\57\0\37\1\21\0\33\1\65\0\36\1"+
"\2\0\44\1\4\0\10\1\1\0\5\1\52\0\236\1\2\0\12\4"+
"\u0356\0\6\1\2\0\1\1\1\0\54\1\1\0\2\1\3\0\1\1"+
"\2\0\27\1\252\0\26\1\12\0\32\1\106\0\70\1\6\0\2\1"+
"\100\0\1\1\3\3\1\0\2\3\5\0\4\3\4\1\1\0\3\1"+
"\1\0\33\1\4\0\3\3\4\0\1\3\40\0\35\1\203\0\66\1"+
"\12\0\26\1\12\0\23\1\215\0\111\1\u03b7\0\3\3\65\1\17\3"+
"\37\0\12\4\20\0\3\3\55\1\13\3\2\0\1\3\22\0\31\1"+
"\7\0\12\4\6\0\3\3\44\1\16\3\1\0\12\4\100\0\3\3"+
"\60\1\16\3\4\1\13\0\12\4\u04a6\0\53\1\15\3\10\0\12\4"+
"\u0936\0\u036f\1\221\0\143\1\u0b9d\0\u042f\1\u33d1\0\u0239\1\u04c7\0\105\1"+
"\13\0\1\1\56\3\20\0\4\3\15\1\u4060\0\1\5\1\13\u2163\0"+
"\5\3\3\0\26\3\2\0\7\3\36\0\4\3\224\0\3\3\u01bb\0"+
"\125\1\1\0\107\1\1\0\2\1\2\0\1\1\2\0\2\1\2\0"+
"\4\1\1\0\14\1\1\0\1\1\1\0\7\1\1\0\101\1\1\0"+
"\4\1\2\0\10\1\1\0\7\1\1\0\34\1\1\0\4\1\1\0"+
"\5\1\1\0\1\1\3\0\7\1\1\0\u0154\1\2\0\31\1\1\0"+
"\31\1\1\0\37\1\1\0\31\1\1\0\37\1\1\0\31\1\1\0"+
"\37\1\1\0\31\1\1\0\37\1\1\0\31\1\1\0\10\1\2\0"+
"\62\4\u1600\0\4\1\1\0\33\1\1\0\2\1\1\0\1\1\2\0"+
"\1\1\1\0\12\1\1\0\4\1\1\0\1\1\1\0\1\1\6\0"+
"\1\1\4\0\1\1\1\0\1\1\1\0\1\1\1\0\3\1\1\0"+
"\2\1\1\0\1\1\2\0\1\1\1\0\1\1\1\0\1\1\1\0"+
"\1\1\1\0\1\1\1\0\2\1\1\0\1\1\2\0\4\1\1\0"+
"\7\1\1\0\4\1\1\0\4\1\1\0\1\1\1\0\12\1\1\0"+
"\21\1\5\0\3\1\1\0\5\1\1\0\21\1\u032a\0\32\17\1\13"+
"\u0dff\0\ua6d7\12\51\0\u1035\12\13\0\336\12\u3fe2\0\u021e\12\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\u05ee\0"+
"\1\3\36\0\140\3\200\0\360\3\uffff\0\uffff\0\ufe12\0";
"\42\0\1\32\1\7\3\0\1\31\2\0\1\7\1\0\1\24\1\0"+
"\1\25\1\0\12\21\1\23\1\24\5\0\32\15\4\0\1\26\1\0"+
"\32\15\56\0\1\4\1\15\2\0\1\5\1\4\6\0\1\15\1\0"+
"\1\23\2\0\1\15\5\0\27\15\1\0\37\15\1\0\u01ca\15\4\0"+
"\14\15\5\0\1\23\10\0\5\15\7\0\1\15\1\0\1\15\21\0"+
"\160\5\5\15\1\0\2\15\2\0\4\15\1\24\1\15\6\0\1\15"+
"\1\23\3\15\1\0\1\15\1\0\24\15\1\0\123\15\1\0\213\15"+
"\1\0\7\5\246\15\1\0\46\15\2\0\1\15\7\0\47\15\1\0"+
"\1\24\7\0\55\5\1\0\1\5\1\0\2\5\1\0\2\5\1\0"+
"\1\5\10\0\33\33\5\0\3\33\1\15\1\23\13\0\6\5\6\0"+
"\2\24\2\0\13\5\1\0\1\5\3\0\53\15\25\5\12\20\1\0"+
"\1\20\1\24\1\0\2\15\1\5\143\15\1\0\1\15\10\5\1\0"+
"\6\5\2\15\2\5\1\0\4\5\2\15\12\20\3\15\2\0\1\15"+
"\17\0\1\5\1\15\1\5\36\15\33\5\2\0\131\15\13\5\1\15"+
"\16\0\12\20\41\15\11\5\2\15\2\0\1\24\1\0\1\15\5\0"+
"\26\15\4\5\1\15\11\5\1\15\3\5\1\15\5\5\22\0\31\15"+
"\3\5\104\0\25\15\1\0\10\15\26\0\60\5\66\15\3\5\1\15"+
"\22\5\1\15\7\5\12\15\2\5\2\0\12\20\1\0\20\15\3\5"+
"\1\0\10\15\2\0\2\15\2\0\26\15\1\0\7\15\1\0\1\15"+
"\3\0\4\15\2\0\1\5\1\15\7\5\2\0\2\5\2\0\3\5"+
"\1\15\10\0\1\5\4\0\2\15\1\0\3\15\2\5\2\0\12\20"+
"\2\15\17\0\3\5\1\0\6\15\4\0\2\15\2\0\26\15\1\0"+
"\7\15\1\0\2\15\1\0\2\15\1\0\2\15\2\0\1\5\1\0"+
"\5\5\4\0\2\5\2\0\3\5\3\0\1\5\7\0\4\15\1\0"+
"\1\15\7\0\12\20\2\5\3\15\1\5\13\0\3\5\1\0\11\15"+
"\1\0\3\15\1\0\26\15\1\0\7\15\1\0\2\15\1\0\5\15"+
"\2\0\1\5\1\15\10\5\1\0\3\5\1\0\3\5\2\0\1\15"+
"\17\0\2\15\2\5\2\0\12\20\11\0\1\15\7\0\3\5\1\0"+
"\10\15\2\0\2\15\2\0\26\15\1\0\7\15\1\0\2\15\1\0"+
"\5\15\2\0\1\5\1\15\7\5\2\0\2\5\2\0\3\5\10\0"+
"\2\5\4\0\2\15\1\0\3\15\2\5\2\0\12\20\1\0\1\15"+
"\20\0\1\5\1\15\1\0\6\15\3\0\3\15\1\0\4\15\3\0"+
"\2\15\1\0\1\15\1\0\2\15\3\0\2\15\3\0\3\15\3\0"+
"\14\15\4\0\5\5\3\0\3\5\1\0\4\5\2\0\1\15\6\0"+
"\1\5\16\0\12\20\20\0\4\5\1\0\10\15\1\0\3\15\1\0"+
"\27\15\1\0\20\15\3\0\1\15\7\5\1\0\3\5\1\0\4\5"+
"\7\0\2\5\1\0\3\15\5\0\2\15\2\5\2\0\12\20\20\0"+
"\1\15\3\5\1\0\10\15\1\0\3\15\1\0\27\15\1\0\12\15"+
"\1\0\5\15\2\0\1\5\1\15\7\5\1\0\3\5\1\0\4\5"+
"\7\0\2\5\7\0\1\15\1\0\2\15\2\5\2\0\12\20\1\0"+
"\2\15\16\0\3\5\1\0\10\15\1\0\3\15\1\0\51\15\2\0"+
"\1\15\7\5\1\0\3\5\1\0\4\5\1\15\5\0\3\15\1\5"+
"\7\0\3\15\2\5\2\0\12\20\12\0\6\15\2\0\2\5\1\0"+
"\22\15\3\0\30\15\1\0\11\15\1\0\1\15\2\0\7\15\3\0"+
"\1\5\4\0\6\5\1\0\1\5\1\0\10\5\6\0\12\20\2\0"+
"\2\5\15\0\60\34\1\35\2\34\7\35\5\0\7\34\10\35\1\0"+
"\12\20\47\0\2\34\1\0\1\34\2\0\2\34\1\0\1\34\2\0"+
"\1\34\6\0\4\34\1\0\7\34\1\0\3\34\1\0\1\34\1\0"+
"\1\34\2\0\2\34\1\0\4\34\1\35\2\34\6\35\1\0\2\35"+
"\1\34\2\0\5\34\1\0\1\34\1\0\6\35\2\0\12\20\2\0"+
"\4\34\40\0\1\15\27\0\2\5\6\0\12\20\13\0\1\5\1\0"+
"\1\5\1\0\1\5\4\0\2\5\10\15\1\0\44\15\4\0\24\5"+
"\1\0\2\5\5\15\13\5\1\0\44\5\11\0\1\5\71\0\53\34"+
"\24\35\1\34\12\20\6\0\6\34\4\35\4\34\3\35\1\34\3\35"+
"\2\34\7\35\3\34\4\35\15\34\14\35\1\34\1\35\12\20\4\35"+
"\2\34\46\15\1\0\1\15\5\0\1\15\2\0\53\15\1\0\4\15"+
"\u0100\17\111\15\1\0\4\15\2\0\7\15\1\0\1\15\1\0\4\15"+
"\2\0\51\15\1\0\4\15\2\0\41\15\1\0\4\15\2\0\7\15"+
"\1\0\1\15\1\0\4\15\2\0\17\15\1\0\71\15\1\0\4\15"+
"\2\0\103\15\2\0\3\5\40\0\20\15\20\0\126\15\2\0\6\15"+
"\3\0\u026c\15\2\0\21\15\1\0\32\15\5\0\113\15\3\0\13\15"+
"\7\0\15\15\1\0\4\15\3\5\13\0\22\15\3\5\13\0\22\15"+
"\2\5\14\0\15\15\1\0\3\15\1\0\2\5\14\0\64\34\40\35"+
"\3\0\1\34\4\0\1\34\1\35\2\0\12\20\41\0\4\5\1\0"+
"\12\20\6\0\130\15\10\0\5\15\2\5\42\15\1\5\1\15\5\0"+
"\106\15\12\0\37\15\1\0\14\5\4\0\14\5\12\0\12\20\36\34"+
"\2\0\5\34\13\0\54\34\4\0\32\34\6\0\12\20\1\34\3\0"+
"\2\34\40\0\27\15\5\5\4\0\65\34\12\35\1\0\35\35\2\0"+
"\1\5\12\20\6\0\12\20\6\0\16\34\2\0\17\5\101\0\5\5"+
"\57\15\21\5\7\15\4\0\12\20\21\0\11\5\14\0\3\5\36\15"+
"\15\5\2\15\12\20\54\15\16\5\14\0\44\15\24\5\10\0\12\20"+
"\3\0\3\15\12\20\44\15\2\0\11\15\107\0\3\5\1\0\25\5"+
"\4\15\1\5\4\15\3\5\2\15\1\0\2\5\6\0\300\15\66\5"+
"\5\0\5\5\u0116\15\2\0\6\15\2\0\46\15\2\0\6\15\2\0"+
"\10\15\1\0\1\15\1\0\1\15\1\0\1\15\1\0\37\15\2\0"+
"\65\15\1\0\7\15\1\0\1\15\3\0\3\15\1\0\7\15\3\0"+
"\4\15\2\0\6\15\4\0\15\15\5\0\3\15\1\0\7\15\17\0"+
"\1\5\1\12\2\5\10\0\2\25\12\0\1\25\2\0\1\23\2\0"+
"\5\5\1\26\14\0\1\4\2\0\2\26\3\0\1\24\4\0\1\4"+
"\12\0\1\26\13\0\5\5\1\0\12\5\1\0\1\15\15\0\1\15"+
"\20\0\15\15\63\0\23\5\1\10\15\5\21\0\1\15\4\0\1\15"+
"\2\0\12\15\1\0\1\15\3\0\5\15\4\0\1\4\1\0\1\15"+
"\1\0\1\15\1\0\1\15\1\0\4\15\1\0\12\15\1\16\2\0"+
"\4\15\5\0\5\15\4\0\1\15\21\0\51\15\13\0\6\4\17\0"+
"\2\4\u016f\0\2\4\14\0\1\4\137\0\1\4\106\0\1\4\31\0"+
"\13\4\4\0\3\4\273\0\14\15\1\16\47\15\300\0\2\4\12\0"+
"\1\4\11\0\1\4\72\0\4\4\1\0\5\4\1\4\1\0\7\4"+
"\1\4\2\4\1\4\1\4\1\0\2\4\2\4\1\4\4\4\1\3"+
"\2\4\1\4\1\4\2\4\2\4\1\4\3\4\1\4\3\4\2\4"+
"\10\4\3\4\5\4\1\4\1\4\1\4\5\4\14\4\13\4\2\4"+
"\2\4\1\4\1\4\2\4\1\4\1\4\22\4\1\4\2\4\2\4"+
"\6\4\12\0\2\4\6\4\1\4\1\4\1\4\2\4\3\4\2\4"+
"\10\4\2\4\4\4\2\4\13\4\2\4\5\4\2\4\2\4\1\4"+
"\5\4\2\4\1\4\1\4\1\4\2\4\24\4\2\4\5\4\6\4"+
"\1\4\2\4\1\3\1\4\2\4\1\4\4\4\1\4\2\4\1\4"+
"\2\0\2\4\4\3\1\4\1\4\2\4\1\4\1\0\1\4\1\0"+
"\1\4\6\0\1\4\3\0\1\4\6\0\1\4\12\0\2\4\17\0"+
"\1\4\2\0\1\4\4\0\1\4\1\0\1\4\4\0\3\4\1\0"+
"\1\4\13\0\2\4\3\4\55\0\3\4\11\0\1\4\16\0\1\4"+
"\16\0\1\4\u0174\0\2\4\u01cf\0\3\4\23\0\2\4\63\0\1\4"+
"\4\0\1\4\252\0\57\15\1\0\57\15\1\0\205\15\6\0\4\15"+
"\3\5\2\15\14\0\46\15\1\0\1\15\5\0\1\15\2\0\70\15"+
"\7\0\1\15\17\0\1\5\27\15\11\0\7\15\1\0\7\15\1\0"+
"\7\15\1\0\7\15\1\0\7\15\1\0\7\15\1\0\7\15\1\0"+
"\7\15\1\0\40\5\57\0\1\15\120\0\32\27\1\0\131\27\14\0"+
"\326\27\57\0\1\15\1\0\1\27\31\0\11\27\6\5\1\4\5\22"+
"\2\0\3\27\1\15\1\15\1\4\3\0\126\30\2\0\2\5\2\22"+
"\3\30\133\22\1\0\4\22\5\0\51\15\3\0\136\17\21\0\33\15"+
"\65\0\20\22\227\0\1\4\1\0\1\4\66\0\57\22\1\0\130\22"+
"\250\0\u19b6\27\112\0\u51d6\27\52\0\u048d\15\103\0\56\15\2\0\u010d\15"+
"\3\0\20\15\12\20\2\15\24\0\57\15\4\5\1\0\12\5\1\0"+
"\37\15\2\5\120\15\2\5\45\0\11\15\2\0\147\15\2\0\44\15"+
"\1\0\10\15\77\0\13\15\1\5\3\15\1\5\4\15\1\5\27\15"+
"\5\5\30\0\64\15\14\0\2\5\62\15\22\5\12\0\12\20\6\0"+
"\22\5\6\15\3\0\1\15\1\0\1\15\2\0\12\20\34\15\10\5"+
"\2\0\27\15\15\5\14\0\35\17\3\0\4\5\57\15\16\5\16\0"+
"\1\15\12\20\6\0\5\34\1\35\12\34\12\20\5\34\1\0\51\15"+
"\16\5\11\0\3\15\1\5\10\15\2\5\2\0\12\20\6\0\33\34"+
"\3\35\62\34\1\35\1\34\3\35\2\34\2\35\5\34\2\35\1\34"+
"\1\35\1\34\30\0\5\34\13\15\5\5\2\0\3\15\2\5\12\0"+
"\6\15\2\0\6\15\2\0\6\15\11\0\7\15\1\0\7\15\1\0"+
"\53\15\1\0\12\15\12\0\163\15\10\5\1\0\2\5\2\0\12\20"+
"\6\0\u2ba4\17\14\0\27\17\4\0\61\17\u2104\0\u016e\27\2\0\152\27"+
"\46\0\7\15\14\0\5\15\5\0\1\33\1\5\12\33\1\0\15\33"+
"\1\0\5\33\1\0\1\33\1\0\2\33\1\0\2\33\1\0\12\33"+
"\142\15\41\0\u016b\15\22\0\100\15\2\0\66\15\50\0\14\15\4\0"+
"\16\5\1\6\1\11\1\24\2\0\1\23\1\24\13\0\20\5\3\0"+
"\2\26\30\0\3\26\1\24\1\0\1\25\1\0\1\24\1\23\32\0"+
"\5\15\1\0\207\15\2\0\1\5\7\0\1\25\4\0\1\24\1\0"+
"\1\25\1\0\12\20\1\23\1\24\5\0\32\15\4\0\1\26\1\0"+
"\32\15\13\0\70\22\2\5\37\17\3\0\6\17\2\0\6\17\2\0"+
"\6\17\2\0\3\17\34\0\3\5\4\0\14\15\1\0\32\15\1\0"+
"\23\15\1\0\2\15\1\0\17\15\2\0\16\15\42\0\173\15\105\0"+
"\65\15\210\0\1\5\202\0\35\15\3\0\61\15\17\0\1\5\37\0"+
"\40\15\20\0\33\15\5\0\46\15\5\5\5\0\36\15\2\0\44\15"+
"\4\0\10\15\1\0\5\15\52\0\236\15\2\0\12\20\6\0\44\15"+
"\4\0\44\15\4\0\50\15\10\0\64\15\234\0\u0137\15\11\0\26\15"+
"\12\0\10\15\230\0\6\15\2\0\1\15\1\0\54\15\1\0\2\15"+
"\3\0\1\15\2\0\27\15\12\0\27\15\11\0\37\15\101\0\23\15"+
"\1\0\2\15\12\0\26\15\12\0\32\15\106\0\70\15\6\0\2\15"+
"\100\0\1\15\3\5\1\0\2\5\5\0\4\5\4\15\1\0\3\15"+
"\1\0\33\15\4\0\3\5\4\0\1\5\40\0\35\15\3\0\35\15"+
"\43\0\10\15\1\0\34\15\2\5\31\0\66\15\12\0\26\15\12\0"+
"\23\15\15\0\22\15\156\0\111\15\67\0\63\15\15\0\63\15\u030d\0"+
"\3\5\65\15\17\5\37\0\12\20\17\0\4\5\55\15\13\5\2\0"+
"\1\5\22\0\31\15\7\0\12\20\6\0\3\5\44\15\16\5\1\0"+
"\12\20\20\0\43\15\1\5\2\0\1\15\11\0\3\5\60\15\16\5"+
"\4\15\5\0\3\5\3\0\12\20\1\15\1\0\1\15\43\0\22\15"+
"\1\0\31\15\14\5\6\0\1\5\101\0\7\15\1\0\1\15\1\0"+
"\4\15\1\0\17\15\1\0\12\15\7\0\57\15\14\5\5\0\12\20"+
"\6\0\4\5\1\0\10\15\2\0\2\15\2\0\26\15\1\0\7\15"+
"\1\0\2\15\1\0\5\15\2\0\1\5\1\15\7\5\2\0\2\5"+
"\2\0\3\5\2\0\1\15\6\0\1\5\5\0\5\15\2\5\2\0"+
"\7\5\3\0\5\5\213\0\65\15\22\5\4\15\5\0\12\20\46\0"+
"\60\15\24\5\2\15\1\0\1\15\10\0\12\20\246\0\57\15\7\5"+
"\2\0\11\5\27\0\4\15\2\5\42\0\60\15\21\5\3\0\1\15"+
"\13\0\12\20\46\0\53\15\15\5\10\0\12\20\66\0\32\34\3\0"+
"\17\35\4\0\12\20\2\34\3\0\1\34\u0160\0\100\15\12\20\25\0"+
"\1\15\u01c0\0\71\15\u0107\0\11\15\1\0\45\15\10\5\1\0\10\5"+
"\1\15\17\0\12\20\30\0\36\15\2\0\26\5\1\0\16\5\u0349\0"+
"\u039a\15\146\0\157\15\21\0\304\15\u0abc\0\u042f\15\u0fd1\0\u0247\15\u21b9\0"+
"\u0239\15\7\0\37\15\1\0\12\20\146\0\36\15\2\0\5\5\13\0"+
"\60\15\7\5\11\0\4\15\14\0\12\20\11\0\25\15\5\0\23\15"+
"\u0370\0\105\15\13\0\1\15\56\5\20\0\4\5\15\15\100\0\1\15"+
"\u401f\0\1\22\1\30\u0bfe\0\153\15\5\0\15\15\3\0\11\15\7\0"+
"\12\15\3\0\2\5\1\0\4\5\u14c1\0\5\5\3\0\26\5\2\0"+
"\7\5\36\0\4\5\224\0\3\5\u01bb\0\125\15\1\0\107\15\1\0"+
"\2\15\2\0\1\15\2\0\2\15\2\0\4\15\1\0\14\15\1\0"+
"\1\15\1\0\7\15\1\0\101\15\1\0\4\15\2\0\10\15\1\0"+
"\7\15\1\0\34\15\1\0\4\15\1\0\5\15\1\0\1\15\3\0"+
"\7\15\1\0\u0154\15\2\0\31\15\1\0\31\15\1\0\37\15\1\0"+
"\31\15\1\0\37\15\1\0\31\15\1\0\37\15\1\0\31\15\1\0"+
"\37\15\1\0\31\15\1\0\10\15\2\0\62\20\u0200\0\67\5\4\0"+
"\62\5\10\0\1\5\16\0\1\5\26\0\5\5\1\0\17\5\u0550\0"+
"\7\5\1\0\21\5\2\0\7\5\1\0\2\5\1\0\5\5\u07d5\0"+
"\305\15\13\0\7\5\51\0\104\15\7\5\5\0\12\20\u04a6\0\4\15"+
"\1\0\33\15\1\0\2\15\1\0\1\15\2\0\1\15\1\0\12\15"+
"\1\0\4\15\1\0\1\15\1\0\1\15\6\0\1\15\4\0\1\15"+
"\1\0\1\15\1\0\1\15\1\0\3\15\1\0\2\15\1\0\1\15"+
"\2\0\1\15\1\0\1\15\1\0\1\15\1\0\1\15\1\0\1\15"+
"\1\0\2\15\1\0\1\15\2\0\4\15\1\0\7\15\1\0\4\15"+
"\1\0\4\15\1\0\1\15\1\0\12\15\1\0\21\15\5\0\3\15"+
"\1\0\5\15\1\0\21\15\u0144\0\4\4\1\4\312\4\1\4\60\4"+
"\15\0\3\4\37\0\1\4\32\15\6\0\32\15\2\0\4\4\2\16"+
"\14\15\2\16\12\15\4\0\1\4\2\0\12\4\22\0\71\4\32\1"+
"\1\30\2\4\15\4\12\0\1\4\24\0\1\4\2\0\11\4\1\0"+
"\4\4\11\0\7\4\2\4\256\4\42\4\2\4\141\4\1\3\16\4"+
"\2\4\2\4\1\4\3\4\2\4\44\4\3\3\2\4\1\3\2\4"+
"\3\3\44\4\2\4\3\4\1\4\4\4\5\2\102\4\2\3\2\4"+
"\13\3\25\4\4\3\4\4\1\3\1\4\11\3\3\4\1\3\4\4"+
"\3\3\1\4\3\3\42\4\1\3\123\4\1\4\77\4\10\0\3\4"+
"\6\4\1\4\30\4\7\4\2\4\2\4\1\4\2\3\4\4\1\3"+
"\14\4\1\4\2\4\4\4\2\4\1\3\4\4\2\3\15\4\2\4"+
"\2\4\1\4\10\4\2\4\11\4\1\4\5\4\3\4\14\4\3\4"+
"\10\4\3\4\2\4\1\4\1\4\1\4\4\4\1\4\6\4\1\4"+
"\3\4\1\4\6\4\113\4\3\3\3\4\5\3\60\0\43\4\1\3"+
"\20\4\3\3\11\4\1\3\5\4\5\4\1\4\1\3\6\4\15\4"+
"\6\4\3\4\1\4\1\4\2\4\3\4\1\4\2\4\7\4\6\4"+
"\164\0\14\4\125\0\53\4\14\0\4\4\70\0\10\4\12\0\6\4"+
"\50\0\10\4\36\0\122\4\14\0\4\4\10\4\5\3\1\4\2\3"+
"\6\4\1\3\11\4\12\3\1\4\1\0\1\4\2\3\1\4\6\4"+
"\1\0\52\4\2\4\4\4\3\4\1\4\1\4\47\4\15\4\5\4"+
"\2\3\1\4\2\3\6\4\3\4\15\4\1\4\15\3\42\4\u05fe\4"+
"\2\0\ua6d7\27\51\0\u1035\27\13\0\336\27\2\0\u1682\27\u295e\0\u021e\27"+
"\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\u05ee\0"+
"\1\5\36\0\137\13\1\14\200\0\360\5\uffff\0\uffff\0\ufe12\0";
/**
* Translates characters to character classes
@ -218,12 +284,15 @@ public final class StandardTokenizerImpl {
private static final int [] ZZ_ACTION = zzUnpackAction();
private static final String ZZ_ACTION_PACKED_0 =
"\1\0\1\1\1\2\1\3\1\4\1\5\1\1\1\6"+
"\1\7\1\2\1\1\1\10\1\2\1\0\1\2\1\0"+
"\1\4\1\0\2\2\2\0\1\1\1\0";
"\1\0\2\1\3\2\2\1\1\3\1\2\1\4\2\5"+
"\1\6\1\1\1\7\1\10\1\3\1\11\1\2\1\0"+
"\4\2\1\0\1\2\2\0\1\3\1\0\1\3\2\2"+
"\1\0\1\5\1\2\1\5\1\0\2\3\1\0\2\2"+
"\2\0\1\2\1\0\2\3\5\2\1\0\1\2\1\3"+
"\3\2";
private static int [] zzUnpackAction() {
int [] result = new int[24];
int [] result = new int[61];
int offset = 0;
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
return result;
@ -248,12 +317,17 @@ public final class StandardTokenizerImpl {
private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
private static final String ZZ_ROWMAP_PACKED_0 =
"\0\0\0\22\0\44\0\66\0\110\0\132\0\154\0\176"+
"\0\220\0\242\0\264\0\306\0\330\0\352\0\374\0\u010e"+
"\0\u0120\0\154\0\u0132\0\u0144\0\u0156\0\264\0\u0168\0\u017a";
"\0\0\0\36\0\74\0\132\0\170\0\226\0\264\0\322"+
"\0\360\0\u010e\0\u012c\0\u014a\0\u0168\0\u0186\0\u01a4\0\u01c2"+
"\0\u01e0\0\u01fe\0\u021c\0\u023a\0\74\0\u0258\0\u0276\0\u0294"+
"\0\u02b2\0\264\0\u02d0\0\u02ee\0\322\0\u030c\0\u032a\0\u0348"+
"\0\u0366\0\u0384\0\u03a2\0\u03c0\0\u03de\0\u03fc\0\u01a4\0\u041a"+
"\0\u0438\0\u0456\0\u0474\0\u0492\0\u04b0\0\u04ce\0\u04ec\0\u050a"+
"\0\u0528\0\u0546\0\u0564\0\u0582\0\u05a0\0\u05be\0\u05dc\0\u05fa"+
"\0\36\0\u0618\0\360\0\u0636\0\u0654";
private static int [] zzUnpackRowMap() {
int [] result = new int[24];
int [] result = new int[61];
int offset = 0;
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
return result;
@ -276,33 +350,94 @@ public final class StandardTokenizerImpl {
private static final int [] ZZ_TRANS = zzUnpackTrans();
private static final String ZZ_TRANS_PACKED_0 =
"\1\2\1\3\1\4\1\2\1\5\1\6\3\2\1\7"+
"\1\10\1\11\2\2\1\12\1\13\2\14\23\0\3\3"+
"\1\15\1\0\1\16\1\0\1\16\1\17\2\0\1\16"+
"\1\0\1\12\2\0\1\3\1\0\1\3\2\4\1\15"+
"\1\0\1\16\1\0\1\16\1\17\2\0\1\16\1\0"+
"\1\12\2\0\1\4\1\0\2\3\2\5\2\0\2\20"+
"\1\21\2\0\1\20\1\0\1\12\2\0\1\5\3\0"+
"\1\6\1\0\1\6\3\0\1\17\7\0\1\6\1\0"+
"\2\3\1\22\1\5\1\23\3\0\1\22\4\0\1\12"+
"\2\0\1\22\3\0\1\10\15\0\1\10\3\0\1\11"+
"\15\0\1\11\1\0\2\3\1\12\1\15\1\0\1\16"+
"\1\0\1\16\1\17\2\0\1\24\1\25\1\12\2\0"+
"\1\12\3\0\1\26\13\0\1\27\1\0\1\26\3\0"+
"\1\14\14\0\2\14\1\0\2\3\2\15\2\0\2\30"+
"\1\17\2\0\1\30\1\0\1\12\2\0\1\15\1\0"+
"\2\3\1\16\12\0\1\3\2\0\1\16\1\0\2\3"+
"\1\17\1\15\1\23\3\0\1\17\4\0\1\12\2\0"+
"\1\17\3\0\1\20\1\5\14\0\1\20\1\0\2\3"+
"\1\21\1\5\1\23\3\0\1\21\4\0\1\12\2\0"+
"\1\21\3\0\1\23\1\0\1\23\3\0\1\17\7\0"+
"\1\23\1\0\2\3\1\24\1\15\4\0\1\17\4\0"+
"\1\12\2\0\1\24\3\0\1\25\12\0\1\24\2\0"+
"\1\25\3\0\1\27\13\0\1\27\1\0\1\27\3\0"+
"\1\30\1\15\14\0\1\30";
"\1\2\1\3\1\4\1\5\1\6\2\2\1\7\2\2"+
"\1\10\2\2\1\11\1\12\1\13\1\14\1\15\1\16"+
"\3\2\1\17\1\20\1\21\2\2\1\22\2\23\37\0"+
"\1\24\3\0\2\25\1\0\5\25\20\0\1\25\5\0"+
"\1\4\2\0\1\4\1\0\1\26\2\4\20\0\1\4"+
"\2\0\1\4\2\0\1\5\2\0\1\5\1\27\1\30"+
"\2\5\20\0\1\5\5\0\1\6\2\0\1\6\1\27"+
"\1\31\2\6\20\0\1\6\5\0\1\32\2\0\1\33"+
"\1\34\3\32\20\0\1\32\3\0\1\5\1\6\5\0"+
"\1\35\3\0\1\6\24\0\2\11\1\0\10\11\2\36"+
"\1\0\1\37\1\0\1\37\1\40\2\0\1\37\1\0"+
"\1\22\1\0\1\11\5\0\1\12\1\11\1\0\1\12"+
"\1\41\1\42\2\12\3\11\2\36\1\0\1\37\1\0"+
"\1\37\1\40\2\0\1\37\1\0\1\22\1\0\1\12"+
"\5\0\2\13\1\0\5\13\2\11\1\13\2\36\1\0"+
"\1\37\1\0\1\37\1\40\2\0\1\37\1\0\1\22"+
"\1\0\1\13\5\0\2\14\1\0\5\14\3\11\2\14"+
"\2\0\2\43\1\44\2\0\1\43\1\0\1\22\1\0"+
"\1\14\5\0\1\15\1\14\1\0\1\45\1\46\3\15"+
"\3\11\2\14\2\0\2\43\1\44\2\0\1\43\1\0"+
"\1\22\1\0\1\15\5\0\2\16\1\0\5\16\5\0"+
"\1\16\3\0\1\40\6\0\1\16\5\0\2\47\1\0"+
"\5\47\3\11\2\14\1\50\3\0\1\47\4\0\1\22"+
"\1\0\1\47\5\0\2\20\1\0\5\20\20\0\1\20"+
"\5\0\2\21\1\0\5\21\20\0\1\21\5\0\2\22"+
"\1\0\5\22\3\11\2\36\1\0\1\37\1\0\1\37"+
"\1\40\2\0\1\51\1\52\1\22\1\0\1\22\5\0"+
"\2\23\1\0\5\23\17\0\2\23\5\0\2\24\1\0"+
"\5\24\20\0\1\24\2\0\1\4\1\53\1\54\1\4"+
"\2\0\1\4\1\0\1\26\2\4\1\0\1\54\16\0"+
"\1\4\12\0\1\55\1\56\24\0\1\4\1\53\1\54"+
"\1\5\2\0\1\5\1\27\1\30\2\5\1\0\1\54"+
"\16\0\1\5\2\0\1\4\1\53\1\54\1\6\2\0"+
"\1\6\1\27\1\31\2\6\1\0\1\54\16\0\1\6"+
"\5\0\1\33\2\0\1\33\1\34\3\33\20\0\1\33"+
"\10\0\1\57\32\0\2\36\1\0\5\36\3\11\2\36"+
"\2\0\2\60\1\40\2\0\1\60\1\0\1\22\1\0"+
"\1\36\5\0\2\37\1\0\5\37\3\11\13\0\1\11"+
"\1\0\1\37\5\0\2\40\1\0\5\40\3\11\2\36"+
"\1\50\3\0\1\40\4\0\1\22\1\0\1\40\5\0"+
"\2\11\1\0\2\11\1\61\1\62\4\11\2\36\1\0"+
"\1\37\1\0\1\37\1\40\2\0\1\37\1\0\1\22"+
"\1\0\1\11\2\0\1\4\1\53\1\54\1\12\1\11"+
"\1\0\1\12\1\41\1\42\2\12\1\11\1\63\1\11"+
"\2\36\1\0\1\37\1\0\1\37\1\40\2\0\1\37"+
"\1\0\1\22\1\0\1\12\5\0\2\43\1\0\5\43"+
"\3\0\2\14\13\0\1\43\5\0\2\44\1\0\5\44"+
"\3\11\2\14\1\50\3\0\1\44\4\0\1\22\1\0"+
"\1\44\5\0\1\45\1\14\1\0\1\45\1\46\3\45"+
"\3\11\2\14\2\0\2\43\1\44\2\0\1\43\1\0"+
"\1\22\1\0\1\45\5\0\2\14\1\0\1\64\4\14"+
"\3\11\2\14\2\0\2\43\1\44\2\0\1\43\1\0"+
"\1\22\1\0\1\14\5\0\2\50\1\0\5\50\5\0"+
"\1\50\3\0\1\40\6\0\1\50\5\0\2\51\1\0"+
"\5\51\3\11\2\36\4\0\1\40\4\0\1\22\1\0"+
"\1\51\5\0\2\52\1\0\5\52\16\0\1\51\1\0"+
"\1\52\2\0\1\4\2\0\1\53\2\0\1\53\1\65"+
"\1\66\2\53\20\0\1\53\5\0\1\54\2\0\1\54"+
"\1\65\1\67\2\54\20\0\1\54\2\0\1\4\1\53"+
"\1\54\5\0\1\70\3\0\1\54\32\0\1\56\1\71"+
"\26\0\1\57\2\0\1\57\1\0\3\57\20\0\1\57"+
"\5\0\2\60\1\0\5\60\3\0\2\36\13\0\1\60"+
"\2\0\1\4\1\53\1\54\2\11\1\0\2\11\1\72"+
"\3\11\1\63\1\11\2\36\1\0\1\37\1\0\1\37"+
"\1\40\2\0\1\37\1\0\1\22\1\0\1\11\5\0"+
"\2\11\1\0\3\11\1\62\1\73\3\11\2\36\1\0"+
"\1\37\1\0\1\37\1\40\2\0\1\37\1\0\1\22"+
"\1\0\1\11\5\0\1\63\1\11\1\0\1\63\1\74"+
"\1\75\2\63\3\11\2\36\1\0\1\37\1\0\1\37"+
"\1\40\2\0\1\37\1\0\1\22\1\0\1\63\5\0"+
"\1\64\1\14\1\0\1\64\1\14\3\64\3\11\2\14"+
"\2\0\2\43\1\44\2\0\1\43\1\0\1\22\1\0"+
"\1\64\12\0\1\55\25\0\1\4\1\53\1\54\1\53"+
"\2\0\1\53\1\65\1\66\2\53\1\0\1\54\16\0"+
"\1\53\2\0\1\4\1\53\2\54\2\0\1\54\1\65"+
"\1\67\2\54\1\0\1\54\16\0\1\54\3\0\1\53"+
"\1\54\5\0\1\70\3\0\1\54\22\0\1\53\1\54"+
"\2\11\1\0\2\11\1\72\3\11\1\63\1\11\2\36"+
"\1\0\1\37\1\0\1\37\1\40\2\0\1\37\1\0"+
"\1\22\1\0\1\11\5\0\2\11\1\0\2\11\1\61"+
"\5\11\2\36\1\0\1\37\1\0\1\37\1\40\2\0"+
"\1\37\1\0\1\22\1\0\1\11\2\0\1\4\1\53"+
"\1\54\1\63\1\11\1\0\1\63\1\74\1\75\2\63"+
"\1\11\1\63\1\11\2\36\1\0\1\37\1\0\1\37"+
"\1\40\2\0\1\37\1\0\1\22\1\0\1\63";
private static int [] zzUnpackTrans() {
int [] result = new int[396];
int [] result = new int[1650];
int offset = 0;
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
return result;
@ -329,7 +464,7 @@ public final class StandardTokenizerImpl {
/* error messages for the codes above */
private static final String ZZ_ERROR_MSG[] = {
"Unkown internal scanner error",
"Unknown internal scanner error",
"Error: could not match input",
"Error: pushback value was too large"
};
@ -340,11 +475,12 @@ public final class StandardTokenizerImpl {
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
private static final String ZZ_ATTRIBUTE_PACKED_0 =
"\1\0\1\11\13\1\1\0\1\1\1\0\1\1\1\0"+
"\2\1\2\0\1\1\1\0";
"\1\0\1\11\22\1\1\0\4\1\1\0\1\1\2\0"+
"\1\1\1\0\3\1\1\0\3\1\1\0\2\1\1\0"+
"\2\1\2\0\1\1\1\0\7\1\1\0\1\11\4\1";
private static int [] zzUnpackAttribute() {
int [] result = new int[24];
int [] result = new int[61];
int offset = 0;
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
return result;
@ -401,11 +537,11 @@ public final class StandardTokenizerImpl {
private int yycolumn;
/**
* zzAtBOL == true <=> the scanner is currently at the beginning of a line
* zzAtBOL == true iff the scanner is currently at the beginning of a line
*/
private boolean zzAtBOL = true;
/** zzAtEOF == true <=> the scanner is at the EOF */
/** zzAtEOF == true iff the scanner is at the EOF */
private boolean zzAtEOF;
/** denotes if the user-EOF-code has already been executed */
@ -447,6 +583,9 @@ public final class StandardTokenizerImpl {
/** Hangul token type */
public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
/** Emoji token type */
public static final int EMOJI_TYPE = StandardTokenizer.EMOJI;
/** Character count processed so far */
public final int yychar()
@ -492,7 +631,7 @@ public final class StandardTokenizerImpl {
char [] map = new char[0x110000];
int i = 0; /* index in packed string */
int j = 0; /* index in unpacked array */
while (i < 2836) {
while (i < 4122) {
int count = packed.charAt(i++);
char value = packed.charAt(i++);
do map[j++] = value; while (--count > 0);
@ -500,6 +639,8 @@ public final class StandardTokenizerImpl {
return map;
}
/* -------------------------------------------------------------------------------- */
/* Begin Lucene-specific disable-buffer-expansion modifications to skeleton.default */
/**
* Refills the input buffer.
@ -527,32 +668,45 @@ public final class StandardTokenizerImpl {
/* fill the buffer with new input */
int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;
int totalRead = 0;
while (totalRead < requested) {
int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
if (numRead == -1) {
break;
}
totalRead += numRead;
int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;
if (requested == 0) {
return true;
}
int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
if (totalRead > 0) {
zzEndRead += totalRead;
if (totalRead == requested) { /* possibly more input available */
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
/* not supposed to occur according to specification of java.io.Reader */
if (numRead == 0) {
throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
}
if (numRead > 0) {
zzEndRead += numRead;
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
if (numRead == requested) { // We might have requested too few chars to encode a full Unicode character.
--zzEndRead;
zzFinalHighSurrogate = 1;
if (totalRead == 1) { return true; }
if (numRead == 1) {
return true;
}
} else { // There is room in the buffer for at least one more char
int c = zzReader.read(); // Expecting to read a low surrogate char
if (c == -1) {
return true;
} else {
zzBuffer[zzEndRead++] = (char)c;
return false;
}
}
}
/* potentially more input available */
return false;
}
// totalRead = 0: End of stream
/* numRead < 0 ==> end of stream */
return true;
}
/* End Lucene-specific disable-buffer-expansion modifications to skeleton.default */
/* ------------------------------------------------------------------------------ */
/**
* Closes the input stream.
@ -773,49 +927,62 @@ public final class StandardTokenizerImpl {
// store back cached position
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 1:
{ /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
}
case 9: break;
case 2:
{ return WORD_TYPE;
}
case 10: break;
case 3:
{ return HANGUL_TYPE;
}
case 11: break;
case 4:
{ return NUMERIC_TYPE;
}
case 12: break;
case 5:
{ return KATAKANA_TYPE;
}
case 13: break;
case 6:
{ return IDEOGRAPHIC_TYPE;
}
case 14: break;
case 7:
{ return HIRAGANA_TYPE;
}
case 15: break;
case 8:
{ return SOUTH_EAST_ASIAN_TYPE;
}
case 16: break;
default:
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
{
return YYEOF;
}
}
else {
}
else {
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 1:
{ /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, emoji or SE Asian -- ignore it. */
}
// fall through
case 10: break;
case 2:
{ return EMOJI_TYPE;
}
// fall through
case 11: break;
case 3:
{ return WORD_TYPE;
}
// fall through
case 12: break;
case 4:
{ return HANGUL_TYPE;
}
// fall through
case 13: break;
case 5:
{ return NUMERIC_TYPE;
}
// fall through
case 14: break;
case 6:
{ return KATAKANA_TYPE;
}
// fall through
case 15: break;
case 7:
{ return IDEOGRAPHIC_TYPE;
}
// fall through
case 16: break;
case 8:
{ return HIRAGANA_TYPE;
}
// fall through
case 17: break;
case 9:
{ return SOUTH_EAST_ASIAN_TYPE;
}
// fall through
case 18: break;
default:
zzScanError(ZZ_NO_MATCH);
}
}
}
}
}

View File

@ -34,12 +34,13 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
* <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
* <li>&lt;KATAKANA&gt;: A sequence of katakana characters</li>
* <li>&lt;HANGUL&gt;: A sequence of Hangul characters</li>
* <li>&lt;EMOJI&gt;: A sequence of Emoji characters</li>
* </ul>
*/
@SuppressWarnings("fallthrough")
%%
%unicode 6.3
%unicode 9.0
%integer
%final
%public
@ -48,22 +49,67 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%char
%buffer 255
// UAX#29 WB4. X (Extend | Format)* --> X
//////////////////////////////////////////////////////////////////////////
// Begin Emoji Macros - see documentation below, near the EMOJI_TYPE rule
// TODO: Remove this include file when JFlex supports these properties directly (in Unicode 11.0+)
%include ../../../../../../data/jflex/UnicodeEmojiProperties.jflex
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
//
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] [\p{WB:Format}\p{WB:Extend}]*
HebrewOrALetterEx = [\p{WB:HebrewLetter}\p{WB:ALetter}] [\p{WB:Format}\p{WB:Extend}]*
NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] [\p{WB:Format}\p{WB:Extend}]*
KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
HanEx = \p{Script:Han} [\p{WB:Format}\p{WB:Extend}]*
HiraganaEx = \p{Script:Hiragana} [\p{WB:Format}\p{WB:Extend}]*
SingleQuoteEx = \p{WB:Single_Quote} [\p{WB:Format}\p{WB:Extend}]*
DoubleQuoteEx = \p{WB:Double_Quote} [\p{WB:Format}\p{WB:Extend}]*
HebrewLetterEx = \p{WB:Hebrew_Letter} [\p{WB:Format}\p{WB:Extend}]*
RegionalIndicatorEx = \p{WB:RegionalIndicator} [\p{WB:Format}\p{WB:Extend}]*
ComplexContextEx = \p{LB:Complex_Context} [\p{WB:Format}\p{WB:Extend}]*
// \uFE0E (Text Presentation Selector) and \uFE0F (Emoji Presentation Selector) - included in \p{WB:Extend}
// - are explicitly excluded here so that we can properly handle Emoji sequences.
//
ExtFmtZwjSansPresSel = [[\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]--[\uFE0E\uFE0F]]*
KeyCapBaseChar = [0-9#*]
KeyCapBaseCharEx = {KeyCapBaseChar} {ExtFmtZwjSansPresSel}
KeyCap = \u20E3
KeyCapEx = {KeyCap} {ExtFmtZwjSansPresSel}
// # \u3030 = WAVY DASH; \u303D = PART ALTERNATION MARK
AccidentalEmoji = [©®™\u3030\u303D]
EmojiRKAM = ( \p{WB:Regional_Indicator} | {KeyCapBaseChar} | {AccidentalEmoji} | {Emoji_Modifier} )
// Unlike Unicode properties, macros are not allowed in character classes, so we achieve set difference
// by applying DeMorgan: the expression that matches everything of 'a' not matched by 'b' is: !(!a|b)
// TODO: Convert this expression to character class difference when JFlex supports the properties directly (in Unicode 11.0+)
EmojiSansRKAM = !( ! {Emoji} | {EmojiRKAM} )
EmojiChar = ( {Extended_Pictographic} | {EmojiSansRKAM} )
EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel}
EmojiModifierBaseEx = {Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
EmojiModifierEx = {Emoji_Modifier} {ExtFmtZwjSansPresSel}
EmojiPresentationSelector = \uFE0F
EmojiCharOrPresSeqOrModSeq = ( \p{WB:ZWJ}* {EmojiCharEx} {EmojiPresentationSelector}? ) | ( ( \p{WB:ZWJ}* {EmojiModifierBaseEx} )? {EmojiModifierEx} )
TagSpec = [\u{E0020}-\u{E007E}]
TagTerm = \u{E007F}
// End Emoji Macros
//////////////////////////////////////////////////////////////////////////
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
//
ExtFmtZwj = [\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]*
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] {ExtFmtZwj}
AHLetterEx = [\p{WB:ALetter}\p{WB:Hebrew_Letter}] {ExtFmtZwj}
NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] {ExtFmtZwj}
KatakanaEx = \p{WB:Katakana} {ExtFmtZwj}
MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] {ExtFmtZwj}
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] {ExtFmtZwj}
ExtendNumLetEx = \p{WB:ExtendNumLet} {ExtFmtZwj}
HanEx = \p{Script:Han} {ExtFmtZwj}
HiraganaEx = \p{Script:Hiragana} {ExtFmtZwj}
SingleQuoteEx = \p{WB:Single_Quote} {ExtFmtZwj}
DoubleQuoteEx = \p{WB:Double_Quote} {ExtFmtZwj}
HebrewLetterEx = \p{WB:Hebrew_Letter} {ExtFmtZwj}
RegionalIndicatorEx = \p{WB:Regional_Indicator} {ExtFmtZwj}
ComplexContextEx = \p{LB:Complex_Context} {ExtFmtZwj}
%{
/** Alphanumeric sequences */
@ -93,6 +139,9 @@ ComplexContextEx = \p{LB:Complex_Context}
/** Hangul token type */
public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
/** Emoji token type */
public static final int EMOJI_TYPE = StandardTokenizer.EMOJI;
/** Character count processed so far */
public final int yychar()
@ -120,18 +169,64 @@ ComplexContextEx = \p{LB:Complex_Context}
%%
// UAX#29 WB1. sot ÷
// WB2. ÷ eot
// UAX#29 WB1. sot ÷ Any
// WB2. Any ÷ eot
//
<<EOF>> { return YYEOF; }
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
// WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
// Instead of these: UAX#29 WB3c. ZWJ × (Glue_After_Zwj | EBG)
// WB14. (E_Base | EBG) × E_Modifier
// WB15. ^ (RI RI)* RI × RI
// WB16. [^RI] (RI RI)* RI × RI
//
{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
// We use the "emoji_sequence" rule from http://www.unicode.org/reports/tr51/tr51-14.html (Unicode 11.0)
// and the Emoji data from http://unicode.org/Public/emoji/11.0/emoji-data.txt (in included file UnicodeEmojiProperties.jflex)
//
// emoji_sequence :=
// Top-level EBNF Expanded #1 Expanded #2 Expanded #3
// --------------------- ---------------------------- ----------------------------- ----------------------------------------------
// emoji_core_sequence emoji_combining_sequence emoji_character ( \p{Emoji}
// | emoji_presentation_sequence | \p{Emoji} \uFE0F
// | emoji_keycap_sequence | [0-9#*] \u{FE0F 20E3} [1]
// | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier}
// | emoji_flag_sequence | \p{WB:Regional_Indicator}{2} )
//
// | emoji_zwj_sequence emoji_zwj_element emoji_character ( \p{Emoji}
// | emoji_presentation_sequence | \p{Emoji} \uFE0F
// | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier} )
// ( ZWJ emoji_zwj_element )+ ( \p{WB:ZWJ} ^^ )+
//
// | emoji_tag_sequence tag_base emoji_character ( \p{Emoji}
// | emoji_presentation_sequence | \p{Emoji} \uFE0F
// | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier} )
// tag_spec [\u{E0020}-\u{E007E}]+
// tag_term \u{E007F}
//
// [1] https://unicode.org/Public/emoji/11.0/emoji-test.txt includes key cap sequences
// WITHOUT \uFE0F (emoji presentation indicator), annotating them as "non-fully-qualified";
// TR#51 says about non-fully-qualified *ZWJ sequences* that implementations may
// choose whether to support them for segmentation. This implementation will
// recognize /[0-9#*]\u20E3/ - i.e. without \uFE0F - as Emoji.
//
// See also: http://www.unicode.org/L2/L2016/16315-handling-seg-emoji.pdf
// https://docs.google.com/document/d/1yDZ5TUZNVVKaM9zYCCLbRIAKGNZANsAGl0bcNzGGvn8
//
// In particular, the above docs recommend a modified UAX#29 WB3c rule (covered by TR#51's "emoji_zwj_sequence"):
//
// WB3c ZWJ × (Extended_Pictographic | EmojiNRK)
//
{EmojiCharOrPresSeqOrModSeq} ( ( \p{WB:ZWJ} {EmojiCharOrPresSeqOrModSeq} )* | {TagSpec}+ {TagTerm} )
| {KeyCapBaseCharEx} {EmojiPresentationSelector}? {KeyCapEx}
| {RegionalIndicatorEx}{2}
{ return EMOJI_TYPE; }
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLetQ) × Numeric
// WB12. Numeric × (MidNum | MidNumLetQ) Numeric
// WB13a. (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (AHLetter | Numeric | Katakana)
//
{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
{ return NUMERIC_TYPE; }
// subset of the below for typing purposes only!
@ -141,28 +236,28 @@ ComplexContextEx = \p{LB:Complex_Context}
{KatakanaEx}+
{ return KATAKANA_TYPE; }
// UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
// WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
// WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
// WB7a. Hebrew_Letter × Single_Quote
// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
// WB9. (ALetter | Hebrew_Letter) × Numeric
// WB10. Numeric × (ALetter | Hebrew_Letter)
// WB13. Katakana × Katakana
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
// UAX#29 WB5. AHLetter × AHLetter
// WB6. AHLetter × (MidLetter | MidNumLetQ) AHLetter
// WB7. AHLetter (MidLetter | MidNumLetQ) × AHLetter
// WB7a. Hebrew_Letter × Single_Quote
// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
// WB9. AHLetter × Numeric
// WB10. Numeric × AHLetter
// WB13. Katakana × Katakana
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
//
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {AHLetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {AHLetterEx} )*
)+
)
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {AHLetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {AHLetterEx} )*
)+
)
)*
@ -172,13 +267,13 @@ ComplexContextEx = \p{LB:Complex_Context}
// From UAX #29:
//
// [C]haracters with the Line_Break property values of Contingent_Break (CB),
// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
// [C]haracters with the Line_Break property values of Contingent_Break (CB),
// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
// boundary property values based on criteria outside of the scope of this
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
// In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
// In Unicode 9.0, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC ( ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
@ -191,17 +286,14 @@ ComplexContextEx = \p{LB:Complex_Context}
//
{ComplexContextEx}+ { return SOUTH_EAST_ASIAN_TYPE; }
// UAX#29 WB14. Any ÷ Any
// UAX#29 WB999. Any ÷ Any
//
{HanEx} { return IDEOGRAPHIC_TYPE; }
{HiraganaEx} { return HIRAGANA_TYPE; }
// UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB13c. Regional_Indicator × Regional_Indicator
// WB14. Any ÷ Any
// UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB999. Any ÷ Any
//
{RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
{ /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
[^] { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, emoji or SE Asian -- ignore it. */ }

View File

@ -18,8 +18,11 @@ package org.apache.lucene.analysis.standard;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
@ -27,6 +30,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockGraphTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TestUtil;
@ -282,7 +286,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
}
public void testUnicodeWordBreaks() throws Exception {
WordBreakTestUnicode_6_3_0 wordBreakTest = new WordBreakTestUnicode_6_3_0();
WordBreakTestUnicode_9_0_0 wordBreakTest = new WordBreakTestUnicode_9_0_0();
wordBreakTest.test(a);
}
@ -358,8 +362,80 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "3_1.,2", new String[] { "3_1", "2" });
}
/** simple emoji */
public void testEmoji() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩 💩💩",
new String[] { "💩", "💩", "💩" },
new String[] { "<EMOJI>", "<EMOJI>", "<EMOJI>" });
}
/** emoji zwj sequence */
public void testEmojiSequence() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "👩‍❤️‍👩",
new String[] { "👩‍❤️‍👩" },
new String[] { "<EMOJI>" });
}
/** emoji zwj sequence with fitzpatrick modifier */
public void testEmojiSequenceWithModifier() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "👨🏼‍⚕️",
new String[] { "👨🏼‍⚕️" },
new String[] { "<EMOJI>" });
}
/** regional indicator */
public void testEmojiRegionalIndicator() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "🇺🇸🇺🇸",
new String[] { "🇺🇸", "🇺🇸" },
new String[] { "<EMOJI>", "<EMOJI>" });
}
/** variation sequence */
public void testEmojiVariationSequence() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣",
new String[] { "#️⃣" },
new String[] { "<EMOJI>" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "3",
new String[] { "3",},
new String[] { "<EMOJI>" });
// text presentation sequences
BaseTokenStreamTestCase.assertAnalyzesTo(a, "#\uFE0E",
new String[] { },
new String[] { });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "3\uFE0E", // \uFE0E is included in \p{WB:Extend}
new String[] { "3\uFE0E",},
new String[] { "<NUM>" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E", // \u2B55 = HEAVY BLACK CIRCLE
new String[] { "\u2B55",},
new String[] { "<EMOJI>" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E\u200D\u2B55\uFE0E",
new String[] { "\u2B55", "\u200D\u2B55"},
new String[] { "<EMOJI>", "<EMOJI>" });
}
public void testEmojiTagSequence() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "🏴󠁧󠁢󠁥󠁮󠁧󠁿",
new String[] { "🏴󠁧󠁢󠁥󠁮󠁧󠁿" },
new String[] { "<EMOJI>" });
}
public void testEmojiTokenization() throws Exception {
// simple emoji around latin
BaseTokenStreamTestCase.assertAnalyzesTo(a, "poo💩poo",
new String[] { "poo", "💩", "poo" },
new String[] { "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>" });
// simple emoji around non-latin
BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩中國💩",
new String[] { "💩", "", "", "💩" },
new String[] { "<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>" });
}
  /**
   * Runs the auto-generated Unicode emoji-test.txt tokenization suite
   * (one assertion per emoji sequence, each expected to produce a single
   * {@code <EMOJI>} token) against this test's analyzer.
   */
  public void testUnicodeEmojiTests() throws Exception {
    EmojiTokenizationTestUnicode_11_0 emojiTest = new EmojiTokenizationTestUnicode_11_0();
    emojiTest.test(a);
  }
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer analyzer = new StandardAnalyzer();
@ -416,4 +492,53 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
assertAnalyzesTo(a, "ab cd toolong xy z", new String[]{"ab", "cd", "toolo", "ng", "xy", "z"});
a.close();
}
  /**
   * Verifies that tokenization is unaffected when a {@link Reader} returns a
   * surrogate pair split across two read() calls: the input's 9th char is a
   * high surrogate, and a reader capped at 9 chars per read forces the pair
   * to straddle a read boundary. Token text must match a normal reader's.
   */
  public void testSplitSurrogatePairWithSpoonFeedReader() throws Exception {
    String text = "12345678\ud800\udf00"; // U+D800 U+DF00 = U+10300 = 𐌀 (OLD ITALIC LETTER A)

    // Collect tokens with normal reader
    StandardAnalyzer a = new StandardAnalyzer();
    TokenStream ts = a.tokenStream("dummy", text);
    List<String> tokens = new ArrayList<>();
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      tokens.add(termAtt.toString());
    }
    ts.end();
    ts.close();

    // Tokens from a spoon-feed reader should be the same as from a normal reader
    // The 9th char is a high surrogate, so the 9-max-chars spoon-feed reader will split the surrogate pair at a read boundary
    Reader reader = new SpoonFeedMaxCharsReaderWrapper(9, new StringReader(text));
    ts = a.tokenStream("dummy", reader);
    termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    // Compare token-by-token against the reference token list collected above
    for (int tokenNum = 0 ; ts.incrementToken() ; ++tokenNum) {
      assertEquals("token #" + tokenNum + " mismatch: ", termAtt.toString(), tokens.get(tokenNum));
    }
    ts.end();
    ts.close();
  }
}
class SpoonFeedMaxCharsReaderWrapper extends Reader {
private final Reader in;
private final int maxChars;
public SpoonFeedMaxCharsReaderWrapper(int maxChars, Reader in) {
this.in = in;
this.maxChars = maxChars;
}
@Override
public void close() throws IOException {
in.close();
}
/** Returns the configured number of chars if available */
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
return in.read(cbuf, off, Math.min(maxChars, len));
}
}

View File

@ -0,0 +1,150 @@
#!/usr/bin/perl
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
use warnings;
use strict;

use File::Spec;
use Getopt::Long;
use LWP::UserAgent;

# Locate this script so the generated Java file can be written next to it.
my ($volume, $directory, $script_name) = File::Spec->splitpath($0);

# Required option: the Unicode emoji data version, e.g. "11.0".
# NOTE(review): the pattern is unanchored, so any string *containing*
# "digits.digits" passes validation -- confirm whether stricter anchoring
# is wanted before relying on this to reject malformed versions.
my $version = '';
unless (GetOptions("version=s" => \$version) && $version =~ /\d+\.\d+/) {
  print STDERR "Usage: $script_name -v <version>\n";
  print STDERR "\tversion must be of the form X.Y, e.g. 11.0\n"
      if ($version);
  exit 1;
}

# Source data URL and names for the generated JUnit test class:
# EmojiTokenizationTestUnicode_<X>_<Y>.java
my $url = "http://www.unicode.org/Public/emoji/${version}/emoji-test.txt";
my $underscore_version = $version;
$underscore_version =~ s/\./_/g;
my $class_name = "EmojiTokenizationTestUnicode_${underscore_version}";
my $output_filename = "${class_name}.java";
my $header =<<"__HEADER__";
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.standard;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.junit.Ignore;
/**
* This class was automatically generated by ${script_name}
* from: ${url}
*
* emoji-test.txt contains emoji char sequences, which are represented as
* tokenization tests in this class.
*
*/
\@Ignore
public class ${class_name} extends BaseTokenStreamTestCase {
public void test(Analyzer analyzer) throws Exception {
for (int i = 0 ; i < tests.length ; i += 2) {
String test = tests[i + 1];
try {
assertAnalyzesTo(analyzer, test, new String[] { test }, new String[] { "<EMOJI>" });
} catch (Throwable t) {
throw new Exception("Failed to tokenize \\"" + tests[i] + "\\":", t);
}
}
}
private String[] tests = new String[] {
__HEADER__
# Fetch the emoji test data and emit the generated Java test class:
# for each data line, write the raw line (as a comment-ish string element)
# followed by the corresponding \u-escaped test string.
my @tests = split /\r?\n/, get_URL_content($url);

my $output_path = File::Spec->catpath($volume, $directory, $output_filename);

# Three-arg open with a lexical filehandle and low-precedence 'or'.
# The original 'open OUT, ">$output_path" || die ...' never fired the die:
# '||' bound to the (always-true) filename string instead of to open().
open my $out, '>', $output_path
    or die "Error opening '$output_path' for writing: $!";
print STDERR "Writing '$output_path'...";
print $out $header;

my $is_first = 1;
for my $line (@tests) {
  next if ($line =~ /^\s*(?:|\#.*)$/); # Skip blank or comment-only lines
  print $out ",\n\n" unless $is_first;
  $is_first = 0;

  # Example line: 1F46E 1F3FB 200D 2642 FE0F ; fully-qualified # 👮🏻‍♂️ man police officer: light skin tone
  $line =~ s/\s+$//;  # Trim trailing whitespace
  $line =~ s/\t/  /g; # Convert tabs to two spaces (no tabs allowed in Lucene source)
  print $out "    \"$line\",\n";

  # The code points are everything before the first ';'; turn each hex
  # group into a Java \uXXXX escape, splitting supplementary-plane chars
  # (5+ hex digits) into their UTF-16 surrogate pairs.
  my ($test_string) = $line =~ /^(.*?)\s*;/;
  $test_string =~ s/([0-9A-F]+)/\\u$1/g;
  $test_string =~ s/\\u([0-9A-F]{5,})/join('', map { "\\u$_" } above_BMP_char_to_surrogates($1))/ge;
  $test_string =~ s/\s//g;
  print $out "    \"${test_string}\"";
}
print $out "  };\n}\n";

# Check close on a write handle: buffered write errors only surface here.
close $out or die "Error closing '$output_path': $!";
print STDERR "done.\n";
# sub above_BMP_char_to_surrogates
#
# Converts hex references to chars above the BMP (i.e., greater than 0xFFFF)
# to the corresponding UTF-16 surrogate pair
#
# Assumption: input string is a sequence of more than four hex digits
#
sub above_BMP_char_to_surrogates {
  my ($hex_codepoint) = @_;

  my $codepoint = hex($hex_codepoint);
  # Standard UTF-16 decomposition: the lead surrogate carries the top ten
  # bits of (codepoint - 0x10000); the trail surrogate the bottom ten bits.
  my $lead  = 0xD800 + (($codepoint - 0x10000) >> 10);
  my $trail = 0xDC00 + ($codepoint & 0x3FF);

  return map { sprintf('%04X', $_) } ($lead, $trail);
}
# sub get_URL_content
#
# Retrieves and returns the content of the given URL.
#
# sub get_URL_content
#
# Retrieves and returns the raw content of the given URL.  On failure,
# prints the HTTP status line to STDERR and exits the script with status 1.
#
# Uses LWP::UserAgent->get() directly rather than hand-building an
# HTTP::Request: the original relied on HTTP::Request being loaded
# transitively by LWP (it was never use'd), an undeclared dependency.
#
sub get_URL_content {
  my $url = shift;
  print STDERR "Retrieving '$url'...";
  my $user_agent = LWP::UserAgent->new;
  my $response = $user_agent->get($url);
  unless ($response->is_success) {
    print STDERR "Failed to download '$url':\n\t",$response->status_line,"\n";
    exit 1;
  }
  print STDERR "done.\n";
  return $response->content;
}

View File

@ -40,8 +40,6 @@ $underscore_version =~ s/\./_/g;
my $class_name = "WordBreakTestUnicode_${underscore_version}";
my $output_filename = "${class_name}.java";
my $header =<<"__HEADER__";
package org.apache.lucene.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -59,6 +57,8 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
package org.apache.lucene.analysis.standard;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.junit.Ignore;
@ -81,7 +81,7 @@ import org.junit.Ignore;
* \\p{WordBreak = Hebrew_Letter}
* \\p{WordBreak = Katakana}
* \\p{WordBreak = Numeric} (Excludes full-width Arabic digits)
* [\\uFF10-\\uFF19] (Full-width Arabic digits)
* [\\uFF10-\\uFF19] (Full-width Arabic digits)
*/
\@Ignore
public class ${class_name} extends BaseTokenStreamTestCase {
@ -91,6 +91,7 @@ __HEADER__
my $codepoints = [];
map { $codepoints->[$_] = 1 } (0xFF10..0xFF19);
my $regional_indicator_codepoints = [];
# Complex_Context is an alias for 'SA', which is used in LineBreak.txt
# Using lowercase versions of property value names to allow for case-
# insensitive comparison with the names in the Unicode data files.
@ -98,7 +99,9 @@ parse_Unicode_data_file($line_break_url, $codepoints, {'sa' => 1});
parse_Unicode_data_file($scripts_url, $codepoints,
{'han' => 1, 'hiragana' => 1});
parse_Unicode_data_file($word_break_url, $codepoints,
{'aletter' => 1, 'hebrew_letter' => 1, 'katakana' => 1, 'numeric' => 1});
{'aletter' => 1, 'hebrew_letter' => 1, 'katakana' => 1, 'numeric' => 1, 'e_base' => 1,
'e_modifier' => 1, 'glue_after_zwj' => 1, 'e_base_gaz' => 1});
parse_Unicode_data_file($word_break_url, $regional_indicator_codepoints, {'regional_indicator' => 1});
my @tests = split /\r?\n/, get_URL_content($word_break_test_url);
my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
@ -124,10 +127,21 @@ for my $line (@tests) {
$test_string =~ s/\\u000D/\\r/g;
$test_string =~ s/\\u0022/\\\"/g;
$sequence =~ s/^\s*÷\s*//; # Trim leading break character
# TODO: When upgrading JFlex to a version that supports Unicode 11.0+: remove the special case below for a Unicode 9.0 test data line that conflicts with TR#51 11.0 test data
# ÷ 200D ÷ 261D ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) ÷ [999.0] WHITE UP POINTING INDEX (E_Base) ÷ [0.3]
if ($sequence =~ /^200D\s*÷\s*261D$/) {
print OUT " // Skipping this test because it conflicts with TR#51 v11.0 rules.\n\n";
next;
}
my @tokens = ();
my $isfirst = 0;
for my $candidate (split /\s*÷\s*/, $sequence) {
$isfirst = 1;
my @chars = ();
my $has_wanted_char = 0;
my $has_wanted_chars = 0;
my $prev_char_regional_indicator = 0;
while ($candidate =~ /([0-9A-F]+)/gi) {
my $hexchar = $1;
if (4 == length($hexchar)) {
@ -135,12 +149,21 @@ for my $line (@tests) {
} else {
push @chars, above_BMP_char_to_surrogates($hexchar);
}
unless ($has_wanted_char) {
$has_wanted_char = 1 if (defined($codepoints->[hex($hexchar)]));
unless ($has_wanted_chars) {
my $codepoint = hex($hexchar);
if (defined($codepoints->[$codepoint])) {
$has_wanted_chars = 1;
} elsif (defined($regional_indicator_codepoints->[$codepoint])) {
if (1 == $prev_char_regional_indicator) {
$has_wanted_chars = 1; # must be 2 regional indicators in a row
} else {
$prev_char_regional_indicator = 1;
}
}
}
}
if ($has_wanted_char) {
push @tokens, '"'.join('', map { "\\u$_" } @chars).'"';
if ($has_wanted_chars) {
push @tokens, '"'.join('', map { $_ eq "0022" ? "\\\"" : "\\u$_" } @chars).'"';
}
}
print OUT " assertAnalyzesTo(analyzer, \"${test_string}\",\n";