LUCENE-8527: Upgrade JFlex to 1.7.0. StandardTokenizer and UAX29URLEmailTokenizer now support Unicode 9.0, and provide UTS#51 v11.0 Emoji tokenization with the '<EMOJI>' token type.

This commit is contained in:
Steve Rowe 2019-01-08 13:33:49 -05:00
parent 7db4121b45
commit 283b19a8da
24 changed files with 57040 additions and 35638 deletions

View File

@ -241,6 +241,11 @@ Optimizations
======================= Lucene 7.7.0 =======================
Changes in Runtime Behavior
* LUCENE-8527: StandardTokenizer and UAX29URLEmailTokenizer now support Unicode 9.0,
and provide Unicode UTS#51 v11.0 Emoji tokenization with the "<EMOJI>" token type.
Build
* LUCENE-8611: Update randomizedtesting to 2.7.2, JUnit to 4.12, add hamcrest-core
@ -293,6 +298,9 @@ Improvements
* LUCENE-8581: Change LatLonShape encoding to use 4 bytes Per Dimension.
(Ignacio Vera, Nick Knize, Adrien Grand)
* LUCENE-8527: Upgrade JFlex dependency to 1.7.0; in StandardTokenizer and UAX29URLEmailTokenizer,
increase supported Unicode version from 6.3 to 9.0, and support Unicode UTS#51 v11.0 Emoji tokenization.
Optimizations

View File

@ -33,18 +33,14 @@
<property name="unicode-props-file" location="src/java/org/apache/lucene/analysis/util/UnicodeProps.java"/>
<target name="jflex" depends="-install-jflex,clean-jflex,-jflex-ClassicAnalyzer,-jflex-UAX29URLEmailTokenizer,
-jflex-wiki-tokenizer,-jflex-HTMLStripCharFilter"/>
<!-- Because of a bug in JFlex's ant task, HTMLStripCharFilter has to be generated last. -->
<!-- Otherwise the "%apiprivate" option used in its specification will leak into following -->
<!-- ant task invocations. -->
<target name="jflex" depends="init,clean-jflex,-jflex-wiki-tokenizer,-jflex-ClassicAnalyzer,
-jflex-UAX29URLEmailTokenizer,-jflex-HTMLStripCharFilter"/>
<target name="-jflex-HTMLStripCharFilter"
depends="init,generate-jflex-html-char-entities">
<jflex file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex"
outdir="src/java/org/apache/lucene/analysis/charfilter"
nobak="on" inputstreamctor="false"/>
<!-- Remove the inappropriate JFlex-generated constructor -->
<replaceregexp file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java"
match="/\*\*\s*\*\s*Creates a new scanner\s*\*\s*\*\s*@param\s*in\s*the java.io.Reader to read input from\.\s*\*/\s*public HTMLStripCharFilter\(java\.io\.Reader in\)\s*\{\s*this.zzReader = in;\s*\}"
replace="" flags="s"/>
<target name="-jflex-HTMLStripCharFilter" depends="-install-jflex,generate-jflex-html-char-entities">
<run-jflex dir="src/java/org/apache/lucene/analysis/charfilter" name="HTMLStripCharFilter"/>
</target>
<target name="generate-jflex-html-char-entities">
@ -58,17 +54,17 @@
<fixcrlf file="src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex" encoding="UTF-8"/>
</target>
<target name="-jflex-wiki-tokenizer" depends="init,-install-jflex">
<target name="-jflex-wiki-tokenizer" depends="-install-jflex">
<run-jflex dir="src/java/org/apache/lucene/analysis/wikipedia" name="WikipediaTokenizerImpl"/>
</target>
<target name="-jflex-UAX29URLEmailTokenizer" depends="init,-install-jflex">
<run-jflex-and-disable-buffer-expansion
dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
<target name="-jflex-ClassicAnalyzer" depends="-install-jflex">
<run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/>
</target>
<target name="-jflex-ClassicAnalyzer" depends="init,-install-jflex">
<run-jflex dir="src/java/org/apache/lucene/analysis/standard" name="ClassicTokenizerImpl"/>
<target name="-jflex-UAX29URLEmailTokenizer" depends="-install-jflex">
<run-jflex-and-disable-buffer-expansion
dir="src/java/org/apache/lucene/analysis/standard" name="UAX29URLEmailTokenizerImpl"/>
</target>
<target name="clean-jflex">

View File

@ -33,7 +33,7 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
@SuppressWarnings("fallthrough")
%%
%unicode 6.3
%unicode 9.0
%apiprivate
%type int
%final
@ -50,6 +50,10 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
%xstate START_TAG_TAIL_INCLUDE, START_TAG_TAIL_EXCLUDE, START_TAG_TAIL_SUBSTITUTE
%xstate STYLE, STYLE_COMMENT
%init{
super(in);
%init}
// From XML 1.0 <http://www.w3.org/TR/xml/>:
//
// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [...]
@ -165,25 +169,15 @@ InlineElment = ( [aAbBiIqQsSuU] |
private TextSegment outputSegment = inputSegment;
private TextSegment entitySegment = new TextSegment(2);
/**
* Creates a new HTMLStripCharFilter over the provided Reader.
* @param source Reader to strip html tags from.
*/
public HTMLStripCharFilter(Reader source) {
super(source);
this.zzReader = source;
}
/**
* Creates a new HTMLStripCharFilter over the provided Reader
* with the specified start and end tags.
* @param source Reader to strip html tags from.
* @param in Reader to strip html tags from.
* @param escapedTags Tags in this set (both start and end tags)
* will not be filtered out.
*/
public HTMLStripCharFilter(Reader source, Set<String> escapedTags) {
super(source);
this.zzReader = source;
public HTMLStripCharFilter(Reader in, Set<String> escapedTags) {
this(in);
if (null != escapedTags) {
for (String tag : escapedTags) {
if (tag.equalsIgnoreCase("BR")) {

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.6.0 */
/* The following code was generated by JFlex 1.7.0 */
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -251,7 +251,7 @@ class ClassicTokenizerImpl {
/* error messages for the codes above */
private static final String ZZ_ERROR_MSG[] = {
"Unkown internal scanner error",
"Unknown internal scanner error",
"Error: could not match input",
"Error: pushback value was too large"
};
@ -323,11 +323,11 @@ class ClassicTokenizerImpl {
private int yycolumn;
/**
* zzAtBOL == true <=> the scanner is currently at the beginning of a line
* zzAtBOL == true iff the scanner is currently at the beginning of a line
*/
private boolean zzAtBOL = true;
/** zzAtEOF == true <=> the scanner is at the EOF */
/** zzAtEOF == true iff the scanner is at the EOF */
private boolean zzAtEOF;
/** denotes if the user-EOF-code has already been executed */
@ -436,28 +436,29 @@ public final void getText(CharTermAttribute t) {
}
/* fill the buffer with new input */
int requested = zzBuffer.length - zzEndRead;
int totalRead = 0;
while (totalRead < requested) {
int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
if (numRead == -1) {
break;
}
totalRead += numRead;
}
int requested = zzBuffer.length - zzEndRead;
int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
if (totalRead > 0) {
zzEndRead += totalRead;
if (totalRead == requested) { /* possibly more input available */
/* not supposed to occur according to specification of java.io.Reader */
if (numRead == 0) {
throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
}
if (numRead > 0) {
zzEndRead += numRead;
/* If numRead == requested, we might have requested too few chars to
encode a full Unicode character. We assume that a Reader would
otherwise never return half characters. */
if (numRead == requested) {
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
--zzEndRead;
zzFinalHighSurrogate = 1;
}
}
/* potentially more input available */
return false;
}
// totalRead = 0: End of stream
/* numRead < 0 ==> end of stream */
return true;
}
@ -681,55 +682,65 @@ public final void getText(CharTermAttribute t) {
// store back cached position
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 1:
{ /* Break so we don't hit fall-through warning: */ break;/* ignore */
}
case 11: break;
case 2:
{ return ALPHANUM;
}
case 12: break;
case 3:
{ return CJ;
}
case 13: break;
case 4:
{ return HOST;
}
case 14: break;
case 5:
{ return NUM;
}
case 15: break;
case 6:
{ return APOSTROPHE;
}
case 16: break;
case 7:
{ return COMPANY;
}
case 17: break;
case 8:
{ return ACRONYM_DEP;
}
case 18: break;
case 9:
{ return ACRONYM;
}
case 19: break;
case 10:
{ return EMAIL;
}
case 20: break;
default:
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
return YYEOF;
}
else {
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
return YYEOF;
}
else {
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 1:
{ /* Break so we don't hit fall-through warning: */ break;/* ignore */
}
// fall through
case 11: break;
case 2:
{ return ALPHANUM;
}
// fall through
case 12: break;
case 3:
{ return CJ;
}
// fall through
case 13: break;
case 4:
{ return HOST;
}
// fall through
case 14: break;
case 5:
{ return NUM;
}
// fall through
case 15: break;
case 6:
{ return APOSTROPHE;
}
// fall through
case 16: break;
case 7:
{ return COMPANY;
}
// fall through
case 17: break;
case 8:
{ return ACRONYM_DEP;
}
// fall through
case 18: break;
case 9:
{ return ACRONYM;
}
// fall through
case 19: break;
case 10:
{ return EMAIL;
}
// fall through
case 20: break;
default:
zzScanError(ZZ_NO_MATCH);
}
}
}
}
}

View File

@ -32,33 +32,32 @@ import org.apache.lucene.util.AttributeFactory;
* algorithm, as specified in
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
* URLs and email addresses are also tokenized according to the relevant RFCs.
* <p>
* Tokens produced are of the following types:
* <ul>
* <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
* <li>&lt;NUM&gt;: A number</li>
* <li>&lt;URL&gt;: A URL</li>
* <li>&lt;EMAIL&gt;: An email address</li>
* <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
* <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
* <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
* </ul>
*/
public final class UAX29URLEmailTokenizer extends Tokenizer {
/** A private instance of the JFlex-constructed scanner */
private final UAX29URLEmailTokenizerImpl scanner;
public static final int ALPHANUM = 0;
public static final int NUM = 1;
public static final int SOUTHEAST_ASIAN = 2;
public static final int IDEOGRAPHIC = 3;
public static final int HIRAGANA = 4;
public static final int KATAKANA = 5;
public static final int HANGUL = 6;
public static final int URL = 7;
public static final int EMAIL = 8;
/** Alpha/numeric token type */
public static final int ALPHANUM = 0;
/** Numeric token type */
public static final int NUM = 1;
/** Southeast Asian token type */
public static final int SOUTHEAST_ASIAN = 2;
/** Ideographic token type */
public static final int IDEOGRAPHIC = 3;
/** Hiragana token type */
public static final int HIRAGANA = 4;
/** Katakana token type */
public static final int KATAKANA = 5;
/** Hangul token type */
public static final int HANGUL = 6;
/** URL token type */
public static final int URL = 7;
/** Email token type */
public static final int EMAIL = 8;
/** Emoji token type. */
public static final int EMOJI = 9;
/** String token types that correspond to token type int constants */
public static final String [] TOKEN_TYPES = new String [] {
@ -71,6 +70,7 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL],
"<URL>",
"<EMAIL>",
StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI]
};
/** Absolute maximum sized token */

View File

@ -37,12 +37,13 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
* <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
* <li>&lt;KATAKANA&gt;: A sequence of katakana characters</li>
* <li>&lt;HANGUL&gt;: A sequence of Hangul characters</li>
* <li>&lt;EMOJI&gt;: A sequence of Emoji characters</li>
* </ul>
*/
@SuppressWarnings("fallthrough")
%%
%unicode 6.3
%unicode 9.0
%integer
%final
%public
@ -52,22 +53,73 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%xstate AVOID_BAD_URL
%buffer 255
// UAX#29 WB4. X (Extend | Format)* --> X
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
//
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] [\p{WB:Format}\p{WB:Extend}]*
HebrewOrALetterEx = [\p{WB:HebrewLetter}\p{WB:ALetter}] [\p{WB:Format}\p{WB:Extend}]*
NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] [\p{WB:Format}\p{WB:Extend}]*
KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
HanEx = \p{Script:Han} [\p{WB:Format}\p{WB:Extend}]*
HiraganaEx = \p{Script:Hiragana} [\p{WB:Format}\p{WB:Extend}]*
SingleQuoteEx = \p{WB:Single_Quote} [\p{WB:Format}\p{WB:Extend}]*
DoubleQuoteEx = \p{WB:Double_Quote} [\p{WB:Format}\p{WB:Extend}]*
HebrewLetterEx = \p{WB:Hebrew_Letter} [\p{WB:Format}\p{WB:Extend}]*
RegionalIndicatorEx = \p{WB:RegionalIndicator} [\p{WB:Format}\p{WB:Extend}]*
ComplexContextEx = \p{LB:Complex_Context} [\p{WB:Format}\p{WB:Extend}]*
ExtFmtZwj = [\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]*
//////////////////////////////////////////////////////////////////////////
// Begin Emoji Macros - see documentation below, near the EMOJI_TYPE rule
// TODO: Remove this include file when JFlex supports these properties directly (in Unicode 11.0+)
%include ../../../../../../../../../core/src/data/jflex/UnicodeEmojiProperties.jflex
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
//
// \uFE0E (Text Presentation Selector) and \uFE0F (Emoji Presentation Selector) - included in \p{WB:Extend}
// - are explicitly excluded here so that we can properly handle Emoji sequences.
//
ExtFmtZwjSansPresSel = [[\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]--[\uFE0E\uFE0F]]*
KeyCapBaseChar = [0-9#*]
KeyCapBaseCharEx = {KeyCapBaseChar} {ExtFmtZwjSansPresSel}
KeyCap = \u20E3
KeyCapEx = {KeyCap} {ExtFmtZwjSansPresSel}
// # \u3030 = WAVY DASH; \u303D = PART ALTERNATION MARK
AccidentalEmoji = [©®™\u3030\u303D]
EmojiRKAM = ( \p{WB:Regional_Indicator} | {KeyCapBaseChar} | {AccidentalEmoji} | {Emoji_Modifier} )
// Unlike Unicode properties, macros are not allowed in character classes, so we achieve set difference
// by applying DeMorgan: the expression that matches everything of 'a' not matched by 'b' is: !(!a|b)
// TODO: Convert this expression to character class difference when JFlex supports the properties directly (in Unicode 11.0+)
EmojiSansRKAM = !( ! {Emoji} | {EmojiRKAM} )
EmojiChar = ( {Extended_Pictographic} | {EmojiSansRKAM} )
EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel}
EmojiModifierBaseEx = {Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
EmojiModifierEx = {Emoji_Modifier} {ExtFmtZwjSansPresSel}
EmojiPresentationSelector = \uFE0F
EmojiCharOrPresSeqOrModSeq = ( \p{WB:ZWJ}* {EmojiCharEx} {EmojiPresentationSelector}? ) | ( ( \p{WB:ZWJ}* {EmojiModifierBaseEx} )? {EmojiModifierEx} )
TagSpec = [\u{E0020}-\u{E007E}]
TagTerm = \u{E007F}
// End Emoji Macros
//////////////////////////////////////////////////////////////////////////
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
//
ExtFmtZwj = [\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]*
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] {ExtFmtZwj}
AHLetterEx = [\p{WB:ALetter}\p{WB:Hebrew_Letter}] {ExtFmtZwj}
NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] {ExtFmtZwj}
KatakanaEx = \p{WB:Katakana} {ExtFmtZwj}
MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] {ExtFmtZwj}
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] {ExtFmtZwj}
ExtendNumLetEx = \p{WB:ExtendNumLet} {ExtFmtZwj}
HanEx = \p{Script:Han} {ExtFmtZwj}
HiraganaEx = \p{Script:Hiragana} {ExtFmtZwj}
SingleQuoteEx = \p{WB:Single_Quote} {ExtFmtZwj}
DoubleQuoteEx = \p{WB:Double_Quote} {ExtFmtZwj}
HebrewLetterEx = \p{WB:Hebrew_Letter} {ExtFmtZwj}
RegionalIndicatorEx = \p{WB:Regional_Indicator} {ExtFmtZwj}
ComplexContextEx = \p{LB:Complex_Context} {ExtFmtZwj}
// URL and E-mail syntax specifications:
//
@ -174,18 +226,28 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
*/
public static final int SOUTH_EAST_ASIAN_TYPE = UAX29URLEmailTokenizer.SOUTHEAST_ASIAN;
/** Ideographic token type */
public static final int IDEOGRAPHIC_TYPE = UAX29URLEmailTokenizer.IDEOGRAPHIC;
/** Hiragana token type */
public static final int HIRAGANA_TYPE = UAX29URLEmailTokenizer.HIRAGANA;
/** Katakana token type */
public static final int KATAKANA_TYPE = UAX29URLEmailTokenizer.KATAKANA;
/** Hangul token type */
public static final int HANGUL_TYPE = UAX29URLEmailTokenizer.HANGUL;
/** Email token type */
public static final int EMAIL_TYPE = UAX29URLEmailTokenizer.EMAIL;
/** URL token type */
public static final int URL_TYPE = UAX29URLEmailTokenizer.URL;
/** Emoji token type */
public static final int EMOJI_TYPE = UAX29URLEmailTokenizer.EMOJI;
/** Character count processed so far */
public final int yychar()
{
return yychar;
@ -213,11 +275,11 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
<YYINITIAL, AVOID_BAD_URL> {
// UAX#29 WB1. sot ÷
// WB2. ÷ eot
// UAX#29 WB1. sot ÷ Any
// WB2. Any ÷ eot
//
<<EOF>> { return YYEOF; }
{URL} { yybegin(YYINITIAL); return URL_TYPE; }
// LUCENE-5391: Don't recognize no-scheme domain-only URLs with a following alphanumeric character
@ -244,14 +306,61 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
{EMAIL} { yybegin(YYINITIAL); return EMAIL_TYPE; }
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
// WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
// Instead of these: UAX#29 WB3c. ZWJ × (Glue_After_Zwj | EBG)
// WB14. (E_Base | EBG) × E_Modifier
// WB15. ^ (RI RI)* RI × RI
// WB16. [^RI] (RI RI)* RI × RI
//
// We use the "emoji_sequence" rule from http://www.unicode.org/reports/tr51/tr51-14.html (Unicode 11.0)
// and the Emoji data from http://unicode.org/Public/emoji/11.0/emoji-data.txt (in included file UnicodeEmojiProperties.jflex)
//
// emoji_sequence :=
// Top-level EBNF Expanded #1 Expanded #2 Expanded #3
// --------------------- ---------------------------- ----------------------------- ----------------------------------------------
// emoji_core_sequence emoji_combining_sequence emoji_character ( \p{Emoji}
// | emoji_presentation_sequence | \p{Emoji} \uFE0F
// | emoji_keycap_sequence | [0-9#*] \u{FE0F 20E3} [1]
// | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier}
// | emoji_flag_sequence | \p{WB:Regional_Indicator}{2} )
//
// | emoji_zwj_sequence emoji_zwj_element emoji_character ( \p{Emoji}
// | emoji_presentation_sequence | \p{Emoji} \uFE0F
// | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier} )
// ( ZWJ emoji_zwj_element )+ ( \p{WB:ZWJ} ^^ )+
//
// | emoji_tag_sequence tag_base emoji_character ( \p{Emoji}
// | emoji_presentation_sequence | \p{Emoji} \uFE0F
// | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier} )
// tag_spec [\u{E0020}-\u{E007E}]+
// tag_term \u{E007F}
//
// [1] https://unicode.org/Public/emoji/11.0/emoji-test.txt includes key cap sequences
// WITHOUT \uFE0F (emoji presentation indicator), annotating them as "non-fully-qualified";
// TR#51 says about non-fully-qualified *ZWJ sequences* that implementations may
// choose whether to support them for segmentation. This implementation will
// recognize /[0-9#*]\u20E3/ - i.e. without \uFE0F - as Emoji.
//
// See also: http://www.unicode.org/L2/L2016/16315-handling-seg-emoji.pdf
// https://docs.google.com/document/d/1yDZ5TUZNVVKaM9zYCCLbRIAKGNZANsAGl0bcNzGGvn8
//
// In particular, the above docs recommend a modified UAX#29 WB3c rule (covered by TR#51's "emoji_zwj_sequence"):
//
// WB3c ZWJ × (Extended_Pictographic | EmojiNRK)
//
{EmojiCharOrPresSeqOrModSeq} ( ( \p{WB:ZWJ} {EmojiCharOrPresSeqOrModSeq} )* | {TagSpec}+ {TagTerm} )
| {KeyCapBaseCharEx} {EmojiPresentationSelector}? {KeyCapEx}
| {RegionalIndicatorEx}{2}
{ yybegin(YYINITIAL); return EMOJI_TYPE; }
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLetQ) × Numeric
// WB12. Numeric × (MidNum | MidNumLetQ) Numeric
// WB13a. (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (AHLetter | Numeric | Katakana)
//
{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
{ yybegin(YYINITIAL); return NUMERIC_TYPE; }
{ yybegin(YYINITIAL); return NUMERIC_TYPE; }
// subset of the below for typing purposes only!
{HangulEx}+
@ -260,32 +369,32 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
{KatakanaEx}+
{ yybegin(YYINITIAL); return KATAKANA_TYPE; }
// UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
// WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
// WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
// WB7a. Hebrew_Letter × Single_Quote
// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
// WB9. (ALetter | Hebrew_Letter) × Numeric
// WB10. Numeric × (ALetter | Hebrew_Letter)
// WB13. Katakana × Katakana
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
// UAX#29 WB5. AHLetter × AHLetter
// WB6. AHLetter × (MidLetter | MidNumLetQ) AHLetter
// WB7. AHLetter (MidLetter | MidNumLetQ) × AHLetter
// WB7a. Hebrew_Letter × Single_Quote
// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
// WB9. AHLetter × Numeric
// WB10. Numeric × AHLetter
// WB13. Katakana × Katakana
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
//
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {AHLetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {AHLetterEx} )*
)+
)
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {AHLetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {AHLetterEx} )*
)+
)
)*
{ExtendNumLetEx}*
{ExtendNumLetEx}*
{ yybegin(YYINITIAL); return WORD_TYPE; }
@ -297,7 +406,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
// In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
// In Unicode 9.0, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC ( ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
@ -310,18 +419,15 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
//
{ComplexContextEx}+ { yybegin(YYINITIAL); return SOUTH_EAST_ASIAN_TYPE; }
// UAX#29 WB14. Any ÷ Any
// UAX#29 WB999. Any ÷ Any
//
{HanEx} { yybegin(YYINITIAL); return IDEOGRAPHIC_TYPE; }
{HiraganaEx} { yybegin(YYINITIAL); return HIRAGANA_TYPE; }
// UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB13c. Regional_Indicator × Regional_Indicator
// WB14. Any ÷ Any
// UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB999. Any ÷ Any
//
{RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
{ yybegin(YYINITIAL); /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
[^] { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, emoji or SE Asian -- ignore it. */ }
}

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.6.0 */
/* The following code was generated by JFlex 1.7.0 */
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -341,7 +341,7 @@ class WikipediaTokenizerImpl {
/* error messages for the codes above */
private static final String ZZ_ERROR_MSG[] = {
"Unkown internal scanner error",
"Unknown internal scanner error",
"Error: could not match input",
"Error: pushback value was too large"
};
@ -419,11 +419,11 @@ class WikipediaTokenizerImpl {
private int yycolumn;
/**
* zzAtBOL == true <=> the scanner is currently at the beginning of a line
* zzAtBOL == true iff the scanner is currently at the beginning of a line
*/
private boolean zzAtBOL = true;
/** zzAtEOF == true <=> the scanner is at the EOF */
/** zzAtEOF == true iff the scanner is at the EOF */
private boolean zzAtEOF;
/** denotes if the user-EOF-code has already been executed */
@ -575,28 +575,29 @@ final void reset() {
}
/* fill the buffer with new input */
int requested = zzBuffer.length - zzEndRead;
int totalRead = 0;
while (totalRead < requested) {
int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
if (numRead == -1) {
break;
}
totalRead += numRead;
}
int requested = zzBuffer.length - zzEndRead;
int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
if (totalRead > 0) {
zzEndRead += totalRead;
if (totalRead == requested) { /* possibly more input available */
/* not supposed to occur according to specification of java.io.Reader */
if (numRead == 0) {
throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
}
if (numRead > 0) {
zzEndRead += numRead;
/* If numRead == requested, we might have requested too few chars to
encode a full Unicode character. We assume that a Reader would
otherwise never return half characters. */
if (numRead == requested) {
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
--zzEndRead;
zzFinalHighSurrogate = 1;
}
}
/* potentially more input available */
return false;
}
// totalRead = 0: End of stream
/* numRead < 0 ==> end of stream */
return true;
}
@ -820,199 +821,245 @@ final void reset() {
// store back cached position
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 1:
{ numWikiTokensSeen = 0; positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
}
case 47: break;
case 2:
{ positionInc = 1; return ALPHANUM;
}
case 48: break;
case 3:
{ positionInc = 1; return CJ;
}
case 49: break;
case 4:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
}
case 50: break;
case 5:
{ positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
}
case 51: break;
case 6:
{ yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;
}
case 52: break;
case 7:
{ yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;
}
case 53: break;
case 8:
{ /* Break so we don't hit fall-through warning: */ break;/* ignore */
}
case 54: break;
case 9:
{ if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
}
case 55: break;
case 10:
{ numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
}
case 56: break;
case 11:
{ currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 57: break;
case 12:
{ currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/
}
case 58: break;
case 13:
{ currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 59: break;
case 14:
{ yybegin(STRING); numWikiTokensSeen++; return currentTokType;
}
case 60: break;
case 15:
{ currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING); /* Break so we don't hit fall-through warning: */ break;
}
case 61: break;
case 16:
{ currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;
}
case 62: break;
case 17:
{ yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;
}
case 63: break;
case 18:
{ /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */
}
case 64: break;
case 19:
{ yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/
}
case 65: break;
case 20:
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 66: break;
case 21:
{ yybegin(STRING); return currentTokType;/*pipe*/
}
case 67: break;
case 22:
{ numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}/* Break so we don't hit fall-through warning: */ break;
}
case 68: break;
case 23:
{ numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);/* Break so we don't hit fall-through warning: */ break;
}
case 69: break;
case 24:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
}
case 70: break;
case 25:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;
}
case 71: break;
case 26:
{ yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;
}
case 72: break;
case 27:
{ numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
}
case 73: break;
case 28:
{ currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 74: break;
case 29:
{ currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 75: break;
case 30:
{ yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
}
case 76: break;
case 31:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end italics*/
}
case 77: break;
case 32:
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 78: break;
case 33:
{ positionInc = 1; return APOSTROPHE;
}
case 79: break;
case 34:
{ positionInc = 1; return HOST;
}
case 80: break;
case 35:
{ positionInc = 1; return NUM;
}
case 81: break;
case 36:
{ positionInc = 1; return COMPANY;
}
case 82: break;
case 37:
{ currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 83: break;
case 38:
{ numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold*/
}
case 84: break;
case 39:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end sub header*/
}
case 85: break;
case 40:
{ positionInc = 1; return ACRONYM;
}
case 86: break;
case 41:
{ positionInc = 1; return EMAIL;
}
case 87: break;
case 42:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold italics*/
}
case 88: break;
case 43:
{ positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
}
case 89: break;
case 44:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);/* Break so we don't hit fall-through warning: */ break;
}
case 90: break;
case 45:
{ currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 91: break;
case 46:
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 92: break;
default:
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
return YYEOF;
}
else {
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
return YYEOF;
}
else {
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 1:
{ numWikiTokensSeen = 0; positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 47: break;
case 2:
{ positionInc = 1; return ALPHANUM;
}
// fall through
case 48: break;
case 3:
{ positionInc = 1; return CJ;
}
// fall through
case 49: break;
case 4:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 50: break;
case 5:
{ positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 51: break;
case 6:
{ yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;
}
// fall through
case 52: break;
case 7:
{ yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;
}
// fall through
case 53: break;
case 8:
{ /* Break so we don't hit fall-through warning: */ break;/* ignore */
}
// fall through
case 54: break;
case 9:
{ if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
}
// fall through
case 55: break;
case 10:
{ numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 56: break;
case 11:
{ currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 57: break;
case 12:
{ currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/
}
// fall through
case 58: break;
case 13:
{ currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 59: break;
case 14:
{ yybegin(STRING); numWikiTokensSeen++; return currentTokType;
}
// fall through
case 60: break;
case 15:
{ currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 61: break;
case 16:
{ currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;
}
// fall through
case 62: break;
case 17:
{ yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;
}
// fall through
case 63: break;
case 18:
{ /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */
}
// fall through
case 64: break;
case 19:
{ yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/
}
// fall through
case 65: break;
case 20:
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 66: break;
case 21:
{ yybegin(STRING); return currentTokType;/*pipe*/
}
// fall through
case 67: break;
case 22:
{ numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}/* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 68: break;
case 23:
{ numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);/* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 69: break;
case 24:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 70: break;
case 25:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 71: break;
case 26:
{ yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 72: break;
case 27:
{ numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 73: break;
case 28:
{ currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 74: break;
case 29:
{ currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 75: break;
case 30:
{ yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 76: break;
case 31:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end italics*/
}
// fall through
case 77: break;
case 32:
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 78: break;
case 33:
{ positionInc = 1; return APOSTROPHE;
}
// fall through
case 79: break;
case 34:
{ positionInc = 1; return HOST;
}
// fall through
case 80: break;
case 35:
{ positionInc = 1; return NUM;
}
// fall through
case 81: break;
case 36:
{ positionInc = 1; return COMPANY;
}
// fall through
case 82: break;
case 37:
{ currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 83: break;
case 38:
{ numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold*/
}
// fall through
case 84: break;
case 39:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end sub header*/
}
// fall through
case 85: break;
case 40:
{ positionInc = 1; return ACRONYM;
}
// fall through
case 86: break;
case 41:
{ positionInc = 1; return EMAIL;
}
// fall through
case 87: break;
case 42:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold italics*/
}
// fall through
case 88: break;
case 43:
{ positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
}
// fall through
case 89: break;
case 44:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);/* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 90: break;
case 45:
{ currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 91: break;
case 46:
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
}
// fall through
case 92: break;
default:
zzScanError(ZZ_NO_MATCH);
}
}
}
}
}

View File

@ -499,7 +499,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
String randomHtmlishString2 // Don't create a comment (disallow "<!--") and don't include a closing ">"
= TestUtil.randomHtmlishString(random(), maxNumElems).replaceAll(">", " ").replaceFirst("^--","__");
String unclosedAngleBangNonCDATA = "<!" + randomHtmlishString1 +"-[CDATA[";
String unclosedAngleBangNonCDATA = "<!" + randomHtmlishString2 +"-[CDATA[";
String[] testGold = {
"one<![CDATA[<one><two>three<four></four></two></one>]]>two",

View File

@ -361,14 +361,14 @@ public class TestUAX29URLEmailAnalyzer extends BaseTokenStreamTestCase {
StringBuilder bToken = new StringBuilder();
// exact max length:
for(int i=0;i<StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
for(int i=0;i<UAX29URLEmailAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;i++) {
bToken.append('b');
}
String bString = bToken.toString();
// first bString is exact max default length; next one is 1 too long
String input = "x " + bString + " " + bString + "b";
assertAnalyzesTo(a, input.toString(), new String[] {"x", bString, bString, "b"});
assertAnalyzesTo(a, input, new String[] {"x", bString, bString, "b"});
a.close();
}

View File

@ -467,7 +467,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
}
public void testUnicodeWordBreaks() throws Exception {
WordBreakTestUnicode_6_3_0 wordBreakTest = new WordBreakTestUnicode_6_3_0();
WordBreakTestUnicode_9_0_0 wordBreakTest = new WordBreakTestUnicode_9_0_0();
wordBreakTest.test(a);
}
@ -545,6 +545,80 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
}
/** simple emoji */
public void testEmoji() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩 💩💩",
new String[] { "💩", "💩", "💩" },
new String[] { "<EMOJI>", "<EMOJI>", "<EMOJI>" });
}
/** emoji zwj sequence */
public void testEmojiSequence() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "👩‍❤️‍👩",
new String[] { "👩‍❤️‍👩" },
new String[] { "<EMOJI>" });
}
/** emoji zwj sequence with fitzpatrick modifier */
public void testEmojiSequenceWithModifier() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "👨🏼‍⚕️",
new String[] { "👨🏼‍⚕️" },
new String[] { "<EMOJI>" });
}
/** regional indicator */
public void testEmojiRegionalIndicator() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "🇺🇸🇺🇸",
new String[] { "🇺🇸", "🇺🇸" },
new String[] { "<EMOJI>", "<EMOJI>" });
}
/** variation sequence */
public void testEmojiVariationSequence() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣",
new String[] { "#️⃣" },
new String[] { "<EMOJI>" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "3",
new String[] { "3",},
new String[] { "<EMOJI>" });
// text presentation sequences
BaseTokenStreamTestCase.assertAnalyzesTo(a, "#\uFE0E",
new String[] { },
new String[] { });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "3\uFE0E", // \uFE0E is included in \p{WB:Extend}
new String[] { "3\uFE0E",},
new String[] { "<NUM>" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E", // \u2B55 = HEAVY BLACK CIRCLE
new String[] { "\u2B55",},
new String[] { "<EMOJI>" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E\u200D\u2B55\uFE0E",
new String[] { "\u2B55", "\u200D\u2B55"},
new String[] { "<EMOJI>", "<EMOJI>" });
}
public void testEmojiTagSequence() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "🏴󠁧󠁢󠁥󠁮󠁧󠁿",
new String[] { "🏴󠁧󠁢󠁥󠁮󠁧󠁿" },
new String[] { "<EMOJI>" });
}
public void testEmojiTokenization() throws Exception {
// simple emoji around latin
BaseTokenStreamTestCase.assertAnalyzesTo(a, "poo💩poo",
new String[] { "poo", "💩", "poo" },
new String[] { "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>" });
// simple emoji around non-latin
BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩中國💩",
new String[] { "💩", "", "", "💩" },
new String[] { "<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>" });
}
public void testUnicodeEmojiTests() throws Exception {
EmojiTokenizationTestUnicode_11_0 emojiTest = new EmojiTokenizationTestUnicode_11_0();
emojiTest.test(a);
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);

View File

@ -2388,7 +2388,7 @@ ${ant.project.name}.test.dependencies=${test.classpath.list}
<!-- JFlex task -->
<target name="-install-jflex" unless="jflex.loaded" depends="ivy-availability-check,ivy-configure">
<ivy:cachepath organisation="de.jflex" module="jflex" revision="1.6.0"
<ivy:cachepath organisation="de.jflex" module="jflex" revision="1.7.0"
inline="true" conf="default" transitive="true" pathid="jflex.classpath"/>
<taskdef name="jflex" classname="jflex.anttask.JFlexTask" classpathref="jflex.classpath"/>
<property name="jflex.loaded" value="true"/>
@ -2645,7 +2645,11 @@ The following arguments can be provided to ant to alter its behaviour and target
<attribute name="dir"/>
<attribute name="name"/>
<sequential>
<jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on" inputstreamctor="false"/>
<!-- The default skeleton is specified here to work around a JFlex ant task bug: -->
<!-- invocations with a non-default skeleton will cause following invocations to -->
<!-- use the same skeleton, though not specified, unless the default is configured. -->
<jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on"
skeleton="${common.dir}/core/src/data/jflex/skeleton.default"/>
</sequential>
</macrodef>
@ -2653,20 +2657,13 @@ The following arguments can be provided to ant to alter its behaviour and target
<attribute name="dir"/>
<attribute name="name"/>
<sequential>
<jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on" inputstreamctor="false"/>
<!-- LUCENE-5897: Disallow scanner buffer expansion -->
<replaceregexp file="@{dir}/@{name}.java"
match="[ \t]*/\* is the buffer big enough\? \*/\s+if \(zzCurrentPos >= zzBuffer\.length.*?\}[ \t]*\r?\n"
replace="" flags="s" />
<jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on"
skeleton="${common.dir}/core/src/data/jflex/skeleton.disable.buffer.expansion.txt"/>
<!-- Since the ZZ_BUFFERSIZE declaration is generated rather than in the skeleton, we have to transform it here. -->
<replaceregexp file="@{dir}/@{name}.java"
match="private static final int ZZ_BUFFERSIZE ="
replace="private int ZZ_BUFFERSIZE ="/>
<replaceregexp file="@{dir}/@{name}.java"
match="int requested = zzBuffer.length - zzEndRead;"
replace="int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;"/>
<replaceregexp file="@{dir}/@{name}.java"
match="(zzFinalHighSurrogate = 1;)(\r?\n)"
replace="\1\2 if (totalRead == 1) { return true; }\2"/>
</sequential>
</macrodef>

View File

@ -0,0 +1,25 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// This file was automatically generated by getUnicodeEmojiProperties.pl
// from: http://unicode.org/Public/emoji/11.0/emoji-data.txt
Emoji = [\u{23}\u{2A}\u{30}-\u{39}\u{A9}\u{AE}\u{203C}\u{2049}\u{2122}\u{2139}\u{2194}-\u{2199}\u{21A9}-\u{21AA}\u{231A}-\u{231B}\u{2328}\u{23CF}\u{23E9}-\u{23F3}\u{23F8}-\u{23FA}\u{24C2}\u{25AA}-\u{25AB}\u{25B6}\u{25C0}\u{25FB}-\u{25FE}\u{2600}-\u{2604}\u{260E}\u{2611}\u{2614}-\u{2615}\u{2618}\u{261D}\u{2620}\u{2622}-\u{2623}\u{2626}\u{262A}\u{262E}-\u{262F}\u{2638}-\u{263A}\u{2640}\u{2642}\u{2648}-\u{2653}\u{265F}-\u{2660}\u{2663}\u{2665}-\u{2666}\u{2668}\u{267B}\u{267E}-\u{267F}\u{2692}-\u{2697}\u{2699}\u{269B}-\u{269C}\u{26A0}-\u{26A1}\u{26AA}-\u{26AB}\u{26B0}-\u{26B1}\u{26BD}-\u{26BE}\u{26C4}-\u{26C5}\u{26C8}\u{26CE}-\u{26CF}\u{26D1}\u{26D3}-\u{26D4}\u{26E9}-\u{26EA}\u{26F0}-\u{26F5}\u{26F7}-\u{26FA}\u{26FD}\u{2702}\u{2705}\u{2708}-\u{270D}\u{270F}\u{2712}\u{2714}\u{2716}\u{271D}\u{2721}\u{2728}\u{2733}-\u{2734}\u{2744}\u{2747}\u{274C}\u{274E}\u{2753}-\u{2755}\u{2757}\u{2763}-\u{2764}\u{2795}-\u{2797}\u{27A1}\u{27B0}\u{27BF}\u{2934}-\u{2935}\u{2B05}-\u{2B07}\u{2B1B}-\u{2B1C}\u{2B50}\u{2B55}\u{3030}\u{303D}\u{3297}\u{3299}\u{1F004}\u{1F0CF}\u{1F170}-\u{1F171}\u{1F17E}-\u{1F17F}\u{1F18E}\u{1F191}-\u{1F19A}\u{1F1E6}-\u{1F1FF}\u{1F201}-\u{1F202}\u{1F21A}\u{1F22F}\u{1F232}-\u{1F23A}\u{1F250}-\u{1F251}\u{1F300}-\u{1F321}\u{1F324}-\u{1F393}\u{1F396}-\u{1F397}\u{1F399}-\u{1F39B}\u{1F39E}-\u{1F3F0}\u{1F3F3}-\u{1F3F5}\u{1F3F7}-\u{1F4FD}\u{1F4FF}-\u{1F53D}\u{1F549}-\u{1F54E}\u{1F550}-\u{1F567}\u{1F56F}-\u{1F570}\u{1F573}-\u{1F57A}\u{1F587}\u{1F58A}-\u{1F58D}\u{1F590}\u{1F595}-\u{1F596}\u{1F5A4}-\u{1F5A5}\u{1F5A8}\u{1F5B1}-\u{1F5B2}\u{1F5BC}\u{1F5C2}-\u{1F5C4}\u{1F5D1}-\u{1F5D3}\u{1F5DC}-\u{1F5DE}\u{1F5E1}\u{1F5E3}\u{1F5E8}\u{1F5EF}\u{1F5F3}\u{1F5FA}-\u{1F64F}\u{1F680}-\u{1F6C5}\u{1F6CB}-\u{1F6D2}\u{1F6E0}-\u{1F6E5}\u{1F6E9}\u{1F6EB}-\u{1F6EC}\u{1F6F0}\u{1F6F3}-\u{1F6F9}\u{1F910}-\u{1F93A}\u{1F93C}-\u{1F93E}\u{1F940}-\u{1F945}\u{1F947}-\u{1F970}\u{1F973}-\u{1F976}\u{1F97A}\u{1F97C}-\u{1F9A2}\u{1F9B0}-\u{1F9B9}\u{1F9C0}-\u{1F9C2}\u{1F9D0}-\u{1F9FF}]
Emoji_Modifier = [\u{1F3FB}-\u{1F3FF}]
Emoji_Modifier_Base = [\u{261D}\u{26F9}\u{270A}-\u{270D}\u{1F385}\u{1F3C2}-\u{1F3C4}\u{1F3C7}\u{1F3CA}-\u{1F3CC}\u{1F442}-\u{1F443}\u{1F446}-\u{1F450}\u{1F466}-\u{1F469}\u{1F46E}\u{1F470}-\u{1F478}\u{1F47C}\u{1F481}-\u{1F483}\u{1F485}-\u{1F487}\u{1F4AA}\u{1F574}-\u{1F575}\u{1F57A}\u{1F590}\u{1F595}-\u{1F596}\u{1F645}-\u{1F647}\u{1F64B}-\u{1F64F}\u{1F6A3}\u{1F6B4}-\u{1F6B6}\u{1F6C0}\u{1F6CC}\u{1F918}-\u{1F91C}\u{1F91E}-\u{1F91F}\u{1F926}\u{1F930}-\u{1F939}\u{1F93D}-\u{1F93E}\u{1F9B5}-\u{1F9B6}\u{1F9B8}-\u{1F9B9}\u{1F9D1}-\u{1F9DD}]
Extended_Pictographic = [\u{A9}\u{AE}\u{203C}\u{2049}\u{2122}\u{2139}\u{2194}-\u{2199}\u{21A9}-\u{21AA}\u{231A}-\u{231B}\u{2328}\u{2388}\u{23CF}\u{23E9}-\u{23F3}\u{23F8}-\u{23FA}\u{24C2}\u{25AA}-\u{25AB}\u{25B6}\u{25C0}\u{25FB}-\u{25FE}\u{2600}-\u{2605}\u{2607}-\u{2612}\u{2614}-\u{2685}\u{2690}-\u{2705}\u{2708}-\u{2712}\u{2714}\u{2716}\u{271D}\u{2721}\u{2728}\u{2733}-\u{2734}\u{2744}\u{2747}\u{274C}\u{274E}\u{2753}-\u{2755}\u{2757}\u{2763}-\u{2767}\u{2795}-\u{2797}\u{27A1}\u{27B0}\u{27BF}\u{2934}-\u{2935}\u{2B05}-\u{2B07}\u{2B1B}-\u{2B1C}\u{2B50}\u{2B55}\u{3030}\u{303D}\u{3297}\u{3299}\u{1F000}-\u{1F0FF}\u{1F10D}-\u{1F10F}\u{1F12F}\u{1F16C}-\u{1F171}\u{1F17E}-\u{1F17F}\u{1F18E}\u{1F191}-\u{1F19A}\u{1F1AD}-\u{1F1E5}\u{1F201}-\u{1F20F}\u{1F21A}\u{1F22F}\u{1F232}-\u{1F23A}\u{1F23C}-\u{1F23F}\u{1F249}-\u{1F3FA}\u{1F400}-\u{1F53D}\u{1F546}-\u{1F64F}\u{1F680}-\u{1F6FF}\u{1F774}-\u{1F77F}\u{1F7D5}-\u{1F7FF}\u{1F80C}-\u{1F80F}\u{1F848}-\u{1F84F}\u{1F85A}-\u{1F85F}\u{1F888}-\u{1F88F}\u{1F8AE}-\u{1F8FF}\u{1F90C}-\u{1F93A}\u{1F93C}-\u{1F945}\u{1F947}-\u{1FFFD}]

View File

@ -0,0 +1,168 @@
#!/usr/bin/perl
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
use warnings;
use strict;
use File::Spec;
use Getopt::Long;
use LWP::UserAgent;
# Locate this script so the output file can be written alongside it, and so
# the generated header below can record which script produced the file.
my ($volume, $directory, $script_name) = File::Spec->splitpath($0);
# Require a version of the form X.Y (e.g. 9.0) via -v/--version; the extra
# hint line is printed only when a malformed version was actually supplied.
my $version = '';
unless (GetOptions("version=s" => \$version) && $version =~ /\d+\.\d+/) {
print STDERR "Usage: $script_name -v <version>\n";
print STDERR "\tversion must be of the form X.Y, e.g. 9.0\n"
if ($version);
exit 1;
}
# Unicode publishes one emoji-data.txt per emoji version; the URL is keyed
# by the requested version.
my $emoji_data_url = "http://unicode.org/Public/emoji/$version/emoji-data.txt";
my $output_filename = "UnicodeEmojiProperties.jflex";
# License header plus provenance comments for the generated file; note the
# heredoc interpolates $script_name and $emoji_data_url defined above.
my $header =<<"__HEADER__";
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// This file was automatically generated by ${script_name}
// from: ${emoji_data_url}
__HEADER__
# Maps property name -> arrayref of alternating (start, end) code points,
# filled in by parse_emoji_data_file for the four properties below.
my $property_ranges = {};
my $wanted_properties = { 'Emoji' => 1, 'Emoji_Modifier' => 1, 'Emoji_Modifier_Base' => 1, 'Extended_Pictographic' => 1 };
parse_emoji_data_file($emoji_data_url, $property_ranges, $wanted_properties);
# Write the JFlex include file next to this script.
my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
output_jflex_include_file($output_path, $property_ranges);
# sub parse_emoji_data_file
#
# Downloads and parses the emoji-data.txt file for the requested Unicode emoji
# version, extracting every code point range assigned to each of the wanted
# properties. Which ranges appear is determined entirely by the versioned data
# file itself; this sub performs no per-range age filtering.
#
# Parameters:
#
# - Emoji data file URL
# - Reference to hash of properties mapped to an array of alternating (start,end) code point ranges
# - Reference to hash of wanted property names
#
# Download the emoji data file at $url and record, for each property named in
# %$wanted_props, its code point ranges in $prop_ranges->{$property} as a flat
# list of alternating (start, end) integers, coalescing contiguous or
# overlapping ranges as they are encountered.
sub parse_emoji_data_file {
my ($url, $prop_ranges, $wanted_props) = @_;
my $content = get_URL_content($url);
print STDERR "Parsing '$url'...";
## Data lines look like one of:
##   231A..231B    ; Emoji_Presentation   #  1.1  [2] (⌚..⌛)    watch..hourglass done
##   1F9C0         ; Emoji_Presentation   #  8.0  [1] (🧀)       cheese wedge
##   1FA00..1FA5F  ; Extended_Pictographic#  NA  [96] (🨀️..🩟️)   <reserved-1FA00>..<reserved-1FA5F>
for my $line (split /\r?\n/, $content) {
if (my ($first, $last, $prop) = $line =~ /^([0-9A-F]{4,5})(?:\.\.([0-9A-F]{4,5}))?\s*;\s*([^\s#]+)/) {
next unless defined($wanted_props->{$prop});  # only collect ranges for wanted properties
$prop_ranges->{$prop} = [] unless defined($prop_ranges->{$prop});
$last = $first unless defined($last);         # single code point => degenerate range
my $begin = hex $first;
my $finish = hex $last;
my $ranges = $prop_ranges->{$prop};
if (@$ranges == 0 || $begin > $ranges->[-1] + 1) {
# Disjoint from the previous range for this property: start a new one.
push @$ranges, $begin, $finish;
} else {
# Contiguous or overlapping: extend the previous range's end point.
$ranges->[-1] = $finish;
}
}
# Lines that don't match (comments, blanks, headers) carry no data; skip.
}
print STDERR "done.\n";
}
# sub get_URL_content
#
# Retrieves and returns the content of the given URL.
#
# Parameter:
#
# - URL to get content for
#
# Fetch $url over HTTP and return the raw response body. On any non-success
# status the failure is reported on STDERR and the whole script exits.
sub get_URL_content {
my ($url) = @_;
print STDERR "Retrieving '$url'...";
# LWP::UserAgent->get() builds and dispatches the GET request in one step.
my $response = LWP::UserAgent->new->get($url);
if (!$response->is_success) {
print STDERR "Failed to download '$url':\n\t", $response->status_line, "\n";
exit 1;
}
print STDERR "done.\n";
return $response->content;
}
# sub output_jflex_include_file
#
# Parameters:
#
# - Output path
# - Reference to hash mapping properties to an array of alternating (start,end) codepoint ranges
#
sub output_jflex_include_file {
# Writes the generated JFlex include file: the license/provenance header
# (file-scoped $header) followed by one JFlex character-class definition
# per property, with properties in sorted order for stable output.
my ($path, $prop_ranges) = @_;
# Three-arg open with a lexical handle. The original two-arg form
# 'open OUT, ">$path" || die ...' never reported failures: '||' bound to
# the (always-true) filename string instead of to open().
open my $out, '>', $path
or die "Error opening '$path' for writing: $!";
print STDERR "Writing '$path'...";
print {$out} $header;
for my $prop (sort keys %$prop_ranges) {
my $ranges = $prop_ranges->{$prop};
print {$out} "$prop = [";
# @$ranges holds alternating (start, end) code points; emit "\u{START}"
# and append "-\u{END}" only when the range spans more than one code point.
for (my $index = 0 ; $index < scalar(@$ranges) ; $index += 2) {
printf {$out} "\\u{%X}", $ranges->[$index];
printf {$out} "-\\u{%X}", $ranges->[$index + 1] if ($ranges->[$index + 1] > $ranges->[$index]);
}
print {$out} "]\n";
}
print {$out} "\n";
# Buffered write errors surface at close; fail loudly rather than leave a
# silently truncated include file behind.
close $out
or die "Error closing '$path' after writing: $!";
print STDERR "done.\n";
}

View File

@ -0,0 +1,342 @@
/** This character denotes the end of file */
public static final int YYEOF = -1;
/** initial size of the lookahead buffer */
--- private static final int ZZ_BUFFERSIZE = ...;
/** lexical states */
--- lexical states, charmap
/* error codes */
private static final int ZZ_UNKNOWN_ERROR = 0;
private static final int ZZ_NO_MATCH = 1;
private static final int ZZ_PUSHBACK_2BIG = 2;
/* error messages for the codes above */
private static final String ZZ_ERROR_MSG[] = {
"Unknown internal scanner error",
"Error: could not match input",
"Error: pushback value was too large"
};
--- isFinal list
/** the input device */
private java.io.Reader zzReader;
/** the current state of the DFA */
private int zzState;
/** the current lexical state */
private int zzLexicalState = YYINITIAL;
/** this buffer contains the current text to be matched and is
the source of the yytext() string */
private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
/** the textposition at the last accepting state */
private int zzMarkedPos;
/** the current text position in the buffer */
private int zzCurrentPos;
/** startRead marks the beginning of the yytext() string in the buffer */
private int zzStartRead;
/** endRead marks the last character in the buffer, that has been read
from input */
private int zzEndRead;
/** number of newlines encountered up to the start of the matched text */
private int yyline;
/** the number of characters up to the start of the matched text */
private int yychar;
/**
* the number of characters from the last newline up to the start of the
* matched text
*/
private int yycolumn;
/**
* zzAtBOL == true iff the scanner is currently at the beginning of a line
*/
private boolean zzAtBOL = true;
/** zzAtEOF == true iff the scanner is at the EOF */
private boolean zzAtEOF;
/** denotes if the user-EOF-code has already been executed */
private boolean zzEOFDone;
/**
* The number of occupied positions in zzBuffer beyond zzEndRead.
* When a lead/high surrogate has been read from the input stream
* into the final zzBuffer position, this will have a value of 1;
* otherwise, it will have a value of 0.
*/
private int zzFinalHighSurrogate = 0;
--- user class code
--- constructor declaration
/**
* Refills the input buffer.
*
* @return <code>false</code>, iff there was new input.
*
* @exception java.io.IOException if any I/O-Error occurs
*/
private boolean zzRefill() throws java.io.IOException {
/* first: make room (if you can) */
if (zzStartRead > 0) {
zzEndRead += zzFinalHighSurrogate;
zzFinalHighSurrogate = 0;
System.arraycopy(zzBuffer, zzStartRead,
zzBuffer, 0,
zzEndRead-zzStartRead);
/* translate stored positions */
zzEndRead-= zzStartRead;
zzCurrentPos-= zzStartRead;
zzMarkedPos-= zzStartRead;
zzStartRead = 0;
}
/* is the buffer big enough? */
if (zzCurrentPos >= zzBuffer.length - zzFinalHighSurrogate) {
/* if not: blow it up */
char newBuffer[] = new char[zzBuffer.length*2];
System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
zzBuffer = newBuffer;
zzEndRead += zzFinalHighSurrogate;
zzFinalHighSurrogate = 0;
}
/* fill the buffer with new input */
int requested = zzBuffer.length - zzEndRead;
int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
/* not supposed to occur according to specification of java.io.Reader */
if (numRead == 0) {
throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
}
if (numRead > 0) {
zzEndRead += numRead;
/* If numRead == requested, we might have requested to few chars to
encode a full Unicode character. We assume that a Reader would
otherwise never return half characters. */
if (numRead == requested) {
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
--zzEndRead;
zzFinalHighSurrogate = 1;
}
}
/* potentially more input available */
return false;
}
/* numRead < 0 ==> end of stream */
return true;
}
/**
* Closes the input stream.
*/
public final void yyclose() throws java.io.IOException {
zzAtEOF = true; /* indicate end of file */
zzEndRead = zzStartRead; /* invalidate buffer */
if (zzReader != null)
zzReader.close();
}
/**
* Resets the scanner to read from a new input stream.
* Does not close the old reader.
*
* All internal variables are reset, the old input stream
* <b>cannot</b> be reused (internal buffer is discarded and lost).
* Lexical state is set to <tt>ZZ_INITIAL</tt>.
*
* Internal scan buffer is resized down to its initial length, if it has grown.
*
* @param reader the new input stream
*/
public final void yyreset(java.io.Reader reader) {
zzReader = reader;
zzAtBOL = true;
zzAtEOF = false;
zzEOFDone = false;
zzEndRead = zzStartRead = 0;
zzCurrentPos = zzMarkedPos = 0;
zzFinalHighSurrogate = 0;
yyline = yychar = yycolumn = 0;
zzLexicalState = YYINITIAL;
if (zzBuffer.length > ZZ_BUFFERSIZE)
zzBuffer = new char[ZZ_BUFFERSIZE];
}
/**
* Returns the current lexical state.
*/
public final int yystate() {
return zzLexicalState;
}
/**
* Enters a new lexical state
*
* @param newState the new lexical state
*/
public final void yybegin(int newState) {
zzLexicalState = newState;
}
/**
* Returns the text matched by the current regular expression.
*/
public final String yytext() {
return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
}
/**
* Returns the character at position <tt>pos</tt> from the
* matched text.
*
* It is equivalent to yytext().charAt(pos), but faster
*
* @param pos the position of the character to fetch.
* A value from 0 to yylength()-1.
*
* @return the character at position pos
*/
public final char yycharat(int pos) {
return zzBuffer[zzStartRead+pos];
}
/**
* Returns the length of the matched text region.
*/
public final int yylength() {
return zzMarkedPos-zzStartRead;
}
/**
* Reports an error that occured while scanning.
*
* In a wellformed scanner (no or only correct usage of
* yypushback(int) and a match-all fallback rule) this method
* will only be called with things that "Can't Possibly Happen".
* If this method is called, something is seriously wrong
* (e.g. a JFlex bug producing a faulty scanner etc.).
*
* Usual syntax/scanner level error handling should be done
* in error fallback rules.
*
* @param errorCode the code of the errormessage to display
*/
--- zzScanError declaration
String message;
try {
message = ZZ_ERROR_MSG[errorCode];
}
catch (ArrayIndexOutOfBoundsException e) {
message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
}
--- throws clause
}
/**
* Pushes the specified amount of characters back into the input stream.
*
* They will be read again by then next call of the scanning method
*
* @param number the number of characters to be read again.
* This number must not be greater than yylength()!
*/
--- yypushback decl (contains zzScanError exception)
if ( number > yylength() )
zzScanError(ZZ_PUSHBACK_2BIG);
zzMarkedPos -= number;
}
--- zzDoEOF
/**
* Resumes scanning until the next regular expression is matched,
* the end of input is encountered or an I/O-Error occurs.
*
* @return the next token
* @exception java.io.IOException if any I/O-Error occurs
*/
--- yylex declaration
int zzInput;
int zzAction;
// cached fields:
int zzCurrentPosL;
int zzMarkedPosL;
int zzEndReadL = zzEndRead;
char [] zzBufferL = zzBuffer;
char [] zzCMapL = ZZ_CMAP;
--- local declarations
while (true) {
zzMarkedPosL = zzMarkedPos;
--- start admin (line, char, col count)
zzAction = -1;
zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
--- start admin (lexstate etc)
zzForAction: {
while (true) {
--- next input, line, col, char count, next transition, isFinal action
zzAction = zzState;
zzMarkedPosL = zzCurrentPosL;
--- line count update
}
}
}
// store back cached position
zzMarkedPos = zzMarkedPosL;
--- char count update
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
--- eofvalue
}
else {
--- actions
default:
--- no match
}
}
}
}
--- main
}

View File

@ -0,0 +1,348 @@
/** This character denotes the end of file */
public static final int YYEOF = -1;
/** initial size of the lookahead buffer */
--- private static final int ZZ_BUFFERSIZE = ...;
/** lexical states */
--- lexical states, charmap
/* error codes */
private static final int ZZ_UNKNOWN_ERROR = 0;
private static final int ZZ_NO_MATCH = 1;
private static final int ZZ_PUSHBACK_2BIG = 2;
/* error messages for the codes above */
private static final String ZZ_ERROR_MSG[] = {
"Unknown internal scanner error",
"Error: could not match input",
"Error: pushback value was too large"
};
--- isFinal list
/** the input device */
private java.io.Reader zzReader;
/** the current state of the DFA */
private int zzState;
/** the current lexical state */
private int zzLexicalState = YYINITIAL;
/** this buffer contains the current text to be matched and is
the source of the yytext() string */
private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
/** the text position at the last accepting state */
private int zzMarkedPos;
/** the current text position in the buffer */
private int zzCurrentPos;
/** startRead marks the beginning of the yytext() string in the buffer */
private int zzStartRead;
/** endRead marks the last character in the buffer that has been read
from input */
private int zzEndRead;
/** number of newlines encountered up to the start of the matched text */
private int yyline;
/** the number of characters up to the start of the matched text */
private int yychar;
/**
 * the number of characters from the last newline up to the start of the
 * matched text
 */
private int yycolumn;
/**
 * zzAtBOL == true iff the scanner is currently at the beginning of a line
 */
private boolean zzAtBOL = true;
/** zzAtEOF == true iff the scanner is at the EOF */
private boolean zzAtEOF;
/** denotes if the user-EOF-code has already been executed */
private boolean zzEOFDone;
/**
 * The number of occupied positions in zzBuffer beyond zzEndRead.
 * When a lead/high surrogate has been read from the input stream
 * into the final zzBuffer position, this will have a value of 1;
 * otherwise, it will have a value of 0.
 */
private int zzFinalHighSurrogate = 0;
--- user class code
--- constructor declaration
/* -------------------------------------------------------------------------------- */
/* Begin Lucene-specific disable-buffer-expansion modifications to skeleton.default */
/**
 * Refills the input buffer without ever growing it (Lucene-specific:
 * buffer expansion is disabled, so when the buffer is full this method
 * simply reports that no new input could be read).
 *
 * @return <code>false</code>, iff there was new input.
 *
 * @exception java.io.IOException if any I/O-Error occurs
 */
private boolean zzRefill() throws java.io.IOException {
/* first: make room (if you can) */
if (zzStartRead > 0) {
// Re-attach a high surrogate that a previous refill hid beyond zzEndRead,
// so the compaction below moves and counts it with the rest of the text.
zzEndRead += zzFinalHighSurrogate;
zzFinalHighSurrogate = 0;
System.arraycopy(zzBuffer, zzStartRead,
zzBuffer, 0,
zzEndRead-zzStartRead);
/* translate stored positions */
zzEndRead-= zzStartRead;
zzCurrentPos-= zzStartRead;
zzMarkedPos-= zzStartRead;
zzStartRead = 0;
}
/* fill the buffer with new input */
int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;
if (requested == 0) {
// Buffer is completely full and may not be expanded: no new input.
return true;
}
int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
/* not supposed to occur according to specification of java.io.Reader */
if (numRead == 0) {
throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
}
if (numRead > 0) {
zzEndRead += numRead;
// Never let the filled region end between a surrogate pair: either hide
// the trailing high surrogate until the next refill, or, when there is
// still room, read the expected low surrogate immediately.
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
if (numRead == requested) { // We might have requested too few chars to encode a full Unicode character.
--zzEndRead;
zzFinalHighSurrogate = 1;
if (numRead == 1) {
// The only char read was the hidden surrogate: effectively no new input.
return true;
}
} else { // There is room in the buffer for at least one more char
int c = zzReader.read(); // Expecting to read a low surrogate char
if (c == -1) {
return true;
} else {
zzBuffer[zzEndRead++] = (char)c;
return false;
}
}
}
/* potentially more input available */
return false;
}
/* numRead < 0 ==> end of stream */
return true;
}
/* End Lucene-specific disable-buffer-expansion modifications to skeleton.default */
/* ------------------------------------------------------------------------------ */
/**
 * Closes the input stream and puts the scanner into a terminal state.
 */
public final void yyclose() throws java.io.IOException {
  zzAtEOF = true;               // signal end of file to the scanning loop
  zzEndRead = zzStartRead;      // drop buffered text: nothing left to match
  if (zzReader != null) {
    zzReader.close();
  }
}
/**
 * Resets the scanner to read from a new input stream.
 * Does not close the old reader.
 *
 * All internal variables are reset, the old input stream
 * <b>cannot</b> be reused (internal buffer is discarded and lost).
 * Lexical state is set to <tt>ZZ_INITIAL</tt>.
 *
 * Internal scan buffer is resized down to its initial length, if it has grown.
 *
 * @param reader the new input stream
 */
public final void yyreset(java.io.Reader reader) {
  zzReader = reader;

  // Flags back to their start-of-input values.
  zzAtBOL = true;
  zzAtEOF = false;
  zzEOFDone = false;

  // Empty the buffer and forget any held-back high surrogate.
  zzEndRead = zzStartRead = 0;
  zzCurrentPos = zzMarkedPos = 0;
  zzFinalHighSurrogate = 0;

  // Restart the position counters and the lexical state.
  yyline = yychar = yycolumn = 0;
  zzLexicalState = YYINITIAL;

  // Shrink the buffer if it has grown beyond its initial size.
  if (zzBuffer.length > ZZ_BUFFERSIZE) {
    zzBuffer = new char[ZZ_BUFFERSIZE];
  }
}
/**
 * Returns the current lexical state.
 *
 * @return the state last set via yybegin(int); initially YYINITIAL
 */
public final int yystate() {
return zzLexicalState;
}
/**
 * Enters a new lexical state.
 *
 * @param newState the new lexical state; reported afterwards by yystate()
 */
public final void yybegin(int newState) {
zzLexicalState = newState;
}
/**
 * Returns the text matched by the current regular expression
 * as a freshly allocated String.
 */
public final String yytext() {
  final int matchStart = zzStartRead;
  final int matchLength = zzMarkedPos - matchStart;
  return new String(zzBuffer, matchStart, matchLength);
}
/**
 * Returns the character at position <tt>pos</tt> from the
 * matched text.
 *
 * It is equivalent to yytext().charAt(pos), but faster.
 *
 * NOTE: pos is not range-checked against the match length here; the
 * caller is responsible for passing a value from 0 to yylength()-1.
 *
 * @param pos the position of the character to fetch.
 * A value from 0 to yylength()-1.
 *
 * @return the character at position pos
 */
public final char yycharat(int pos) {
return zzBuffer[zzStartRead+pos];
}
/**
 * Returns the length of the matched text region, in chars
 * (the distance from the match start to the match end).
 */
public final int yylength() {
return zzMarkedPos-zzStartRead;
}
/**
 * Reports an error that occurred while scanning.
 *
 * In a well-formed scanner (no or only correct usage of
 * yypushback(int) and a match-all fallback rule) this method
 * will only be called with things that "Can't Possibly Happen".
 * If this method is called, something is seriously wrong
 * (e.g. a JFlex bug producing a faulty scanner etc.).
 *
 * Usual syntax/scanner level error handling should be done
 * in error fallback rules.
 *
 * @param errorCode the code of the error message to display
 */
--- zzScanError declaration
String message;
try {
message = ZZ_ERROR_MSG[errorCode];
}
catch (ArrayIndexOutOfBoundsException e) {
// Unrecognized code: fall back to the generic error message.
message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
}
--- throws clause
}
/**
 * Pushes the specified amount of characters back into the input stream.
 *
 * They will be read again by the next call of the scanning method.
 *
 * @param number the number of characters to be read again.
 * This number must not be greater than yylength()!
 */
--- yypushback decl (contains zzScanError exception)
if ( number > yylength() )
zzScanError(ZZ_PUSHBACK_2BIG);
// Rewind the match end; scanning resumes from the new position.
zzMarkedPos -= number;
}
--- zzDoEOF
/**
 * Resumes scanning until the next regular expression is matched,
 * the end of input is encountered or an I/O-Error occurs.
 *
 * @return the next token
 * @exception java.io.IOException if any I/O-Error occurs
 */
--- yylex declaration
int zzInput;
int zzAction;
// cached fields:
// (local copies of hot fields; written back below so the scanning loop
// avoids repeated field access)
int zzCurrentPosL;
int zzMarkedPosL;
int zzEndReadL = zzEndRead;
char [] zzBufferL = zzBuffer;
char [] zzCMapL = ZZ_CMAP;
--- local declarations
while (true) {
zzMarkedPosL = zzMarkedPos;
--- start admin (line, char, col count)
zzAction = -1;
zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
--- start admin (lexstate etc)
zzForAction: {
while (true) {
--- next input, line, col, char count, next transition, isFinal action
zzAction = zzState;
zzMarkedPosL = zzCurrentPosL;
--- line count update
}
}
}
// store back cached position
zzMarkedPos = zzMarkedPosL;
--- char count update
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
--- eofvalue
}
else {
--- actions
default:
--- no match
}
}
}
}
--- main
}

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.6.0 */
/* The following code was generated by JFlex 1.7.0 */
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -36,6 +36,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
* <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
* <li>&lt;KATAKANA&gt;: A sequence of katakana characters</li>
* <li>&lt;HANGUL&gt;: A sequence of Hangul characters</li>
* <li>&lt;EMOJI&gt;: A sequence of Emoji characters</li>
* </ul>
*/
@SuppressWarnings("fallthrough")
@ -65,147 +66,212 @@ public final class StandardTokenizerImpl {
* Translates characters to character classes
*/
private static final String ZZ_CMAP_PACKED =
"\42\0\1\15\4\0\1\14\4\0\1\7\1\0\1\10\1\0\12\4"+
"\1\6\1\7\5\0\32\1\4\0\1\11\1\0\32\1\57\0\1\1"+
"\2\0\1\3\7\0\1\1\1\0\1\6\2\0\1\1\5\0\27\1"+
"\1\0\37\1\1\0\u01ca\1\4\0\14\1\5\0\1\6\10\0\5\1"+
"\7\0\1\1\1\0\1\1\21\0\160\3\5\1\1\0\2\1\2\0"+
"\4\1\1\7\7\0\1\1\1\6\3\1\1\0\1\1\1\0\24\1"+
"\1\0\123\1\1\0\213\1\1\0\7\3\236\1\11\0\46\1\2\0"+
"\1\1\7\0\47\1\1\0\1\7\7\0\55\3\1\0\1\3\1\0"+
"\2\3\1\0\2\3\1\0\1\3\10\0\33\16\5\0\3\16\1\1"+
"\1\6\13\0\5\3\7\0\2\7\2\0\13\3\1\0\1\3\3\0"+
"\53\1\25\3\12\4\1\0\1\4\1\7\1\0\2\1\1\3\143\1"+
"\1\0\1\1\10\3\1\0\6\3\2\1\2\3\1\0\4\3\2\1"+
"\12\4\3\1\2\0\1\1\17\0\1\3\1\1\1\3\36\1\33\3"+
"\2\0\131\1\13\3\1\1\16\0\12\4\41\1\11\3\2\1\2\0"+
"\1\7\1\0\1\1\5\0\26\1\4\3\1\1\11\3\1\1\3\3"+
"\1\1\5\3\22\0\31\1\3\3\104\0\1\1\1\0\13\1\67\0"+
"\33\3\1\0\4\3\66\1\3\3\1\1\22\3\1\1\7\3\12\1"+
"\2\3\2\0\12\4\1\0\7\1\1\0\7\1\1\0\3\3\1\0"+
"\10\1\2\0\2\1\2\0\26\1\1\0\7\1\1\0\1\1\3\0"+
"\4\1\2\0\1\3\1\1\7\3\2\0\2\3\2\0\3\3\1\1"+
"\10\0\1\3\4\0\2\1\1\0\3\1\2\3\2\0\12\4\2\1"+
"\17\0\3\3\1\0\6\1\4\0\2\1\2\0\26\1\1\0\7\1"+
"\1\0\2\1\1\0\2\1\1\0\2\1\2\0\1\3\1\0\5\3"+
"\4\0\2\3\2\0\3\3\3\0\1\3\7\0\4\1\1\0\1\1"+
"\7\0\12\4\2\3\3\1\1\3\13\0\3\3\1\0\11\1\1\0"+
"\3\1\1\0\26\1\1\0\7\1\1\0\2\1\1\0\5\1\2\0"+
"\1\3\1\1\10\3\1\0\3\3\1\0\3\3\2\0\1\1\17\0"+
"\2\1\2\3\2\0\12\4\21\0\3\3\1\0\10\1\2\0\2\1"+
"\2\0\26\1\1\0\7\1\1\0\2\1\1\0\5\1\2\0\1\3"+
"\1\1\7\3\2\0\2\3\2\0\3\3\10\0\2\3\4\0\2\1"+
"\1\0\3\1\2\3\2\0\12\4\1\0\1\1\20\0\1\3\1\1"+
"\1\0\6\1\3\0\3\1\1\0\4\1\3\0\2\1\1\0\1\1"+
"\1\0\2\1\3\0\2\1\3\0\3\1\3\0\14\1\4\0\5\3"+
"\3\0\3\3\1\0\4\3\2\0\1\1\6\0\1\3\16\0\12\4"+
"\21\0\3\3\1\0\10\1\1\0\3\1\1\0\27\1\1\0\12\1"+
"\1\0\5\1\3\0\1\1\7\3\1\0\3\3\1\0\4\3\7\0"+
"\2\3\1\0\2\1\6\0\2\1\2\3\2\0\12\4\22\0\2\3"+
"\1\0\10\1\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1"+
"\2\0\1\3\1\1\7\3\1\0\3\3\1\0\4\3\7\0\2\3"+
"\7\0\1\1\1\0\2\1\2\3\2\0\12\4\1\0\2\1\17\0"+
"\2\3\1\0\10\1\1\0\3\1\1\0\51\1\2\0\1\1\7\3"+
"\1\0\3\3\1\0\4\3\1\1\10\0\1\3\10\0\2\1\2\3"+
"\2\0\12\4\12\0\6\1\2\0\2\3\1\0\22\1\3\0\30\1"+
"\1\0\11\1\1\0\1\1\2\0\7\1\3\0\1\3\4\0\6\3"+
"\1\0\1\3\1\0\10\3\22\0\2\3\15\0\60\20\1\21\2\20"+
"\7\21\5\0\7\20\10\21\1\0\12\4\47\0\2\20\1\0\1\20"+
"\2\0\2\20\1\0\1\20\2\0\1\20\6\0\4\20\1\0\7\20"+
"\1\0\3\20\1\0\1\20\1\0\1\20\2\0\2\20\1\0\4\20"+
"\1\21\2\20\6\21\1\0\2\21\1\20\2\0\5\20\1\0\1\20"+
"\1\0\6\21\2\0\12\4\2\0\4\20\40\0\1\1\27\0\2\3"+
"\6\0\12\4\13\0\1\3\1\0\1\3\1\0\1\3\4\0\2\3"+
"\10\1\1\0\44\1\4\0\24\3\1\0\2\3\5\1\13\3\1\0"+
"\44\3\11\0\1\3\71\0\53\20\24\21\1\20\12\4\6\0\6\20"+
"\4\21\4\20\3\21\1\20\3\21\2\20\7\21\3\20\4\21\15\20"+
"\14\21\1\20\1\21\12\4\4\21\2\20\46\1\1\0\1\1\5\0"+
"\1\1\2\0\53\1\1\0\4\1\u0100\2\111\1\1\0\4\1\2\0"+
"\7\1\1\0\1\1\1\0\4\1\2\0\51\1\1\0\4\1\2\0"+
"\41\1\1\0\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0"+
"\17\1\1\0\71\1\1\0\4\1\2\0\103\1\2\0\3\3\40\0"+
"\20\1\20\0\125\1\14\0\u026c\1\2\0\21\1\1\0\32\1\5\0"+
"\113\1\3\0\3\1\17\0\15\1\1\0\4\1\3\3\13\0\22\1"+
"\3\3\13\0\22\1\2\3\14\0\15\1\1\0\3\1\1\0\2\3"+
"\14\0\64\20\40\21\3\0\1\20\4\0\1\20\1\21\2\0\12\4"+
"\41\0\4\3\1\0\12\4\6\0\130\1\10\0\51\1\1\3\1\1"+
"\5\0\106\1\12\0\35\1\3\0\14\3\4\0\14\3\12\0\12\4"+
"\36\20\2\0\5\20\13\0\54\20\4\0\21\21\7\20\2\21\6\0"+
"\12\4\1\20\3\0\2\20\40\0\27\1\5\3\4\0\65\20\12\21"+
"\1\0\35\21\2\0\1\3\12\4\6\0\12\4\6\0\16\20\122\0"+
"\5\3\57\1\21\3\7\1\4\0\12\4\21\0\11\3\14\0\3\3"+
"\36\1\15\3\2\1\12\4\54\1\16\3\14\0\44\1\24\3\10\0"+
"\12\4\3\0\3\1\12\4\44\1\122\0\3\3\1\0\25\3\4\1"+
"\1\3\4\1\3\3\2\1\11\0\300\1\47\3\25\0\4\3\u0116\1"+
"\2\0\6\1\2\0\46\1\2\0\6\1\2\0\10\1\1\0\1\1"+
"\1\0\1\1\1\0\1\1\1\0\37\1\2\0\65\1\1\0\7\1"+
"\1\0\1\1\3\0\3\1\1\0\7\1\3\0\4\1\2\0\6\1"+
"\4\0\15\1\5\0\3\1\1\0\7\1\17\0\4\3\10\0\2\10"+
"\12\0\1\10\2\0\1\6\2\0\5\3\20\0\2\11\3\0\1\7"+
"\17\0\1\11\13\0\5\3\1\0\12\3\1\0\1\1\15\0\1\1"+
"\20\0\15\1\63\0\41\3\21\0\1\1\4\0\1\1\2\0\12\1"+
"\1\0\1\1\3\0\5\1\6\0\1\1\1\0\1\1\1\0\1\1"+
"\1\0\4\1\1\0\13\1\2\0\4\1\5\0\5\1\4\0\1\1"+
"\21\0\51\1\u032d\0\64\1\u0716\0\57\1\1\0\57\1\1\0\205\1"+
"\6\0\4\1\3\3\2\1\14\0\46\1\1\0\1\1\5\0\1\1"+
"\2\0\70\1\7\0\1\1\17\0\1\3\27\1\11\0\7\1\1\0"+
"\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0"+
"\7\1\1\0\7\1\1\0\40\3\57\0\1\1\120\0\32\12\1\0"+
"\131\12\14\0\326\12\57\0\1\1\1\0\1\12\31\0\11\12\6\3"+
"\1\0\5\5\2\0\3\12\1\1\1\1\4\0\126\13\2\0\2\3"+
"\2\5\3\13\133\5\1\0\4\5\5\0\51\1\3\0\136\2\21\0"+
"\33\1\65\0\20\5\320\0\57\5\1\0\130\5\250\0\u19b6\12\112\0"+
"\u51cd\12\63\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\4"+
"\2\1\24\0\57\1\4\3\1\0\12\3\1\0\31\1\7\0\1\3"+
"\120\1\2\3\45\0\11\1\2\0\147\1\2\0\4\1\1\0\4\1"+
"\14\0\13\1\115\0\12\1\1\3\3\1\1\3\4\1\1\3\27\1"+
"\5\3\30\0\64\1\14\0\2\3\62\1\21\3\13\0\12\4\6\0"+
"\22\3\6\1\3\0\1\1\4\0\12\4\34\1\10\3\2\0\27\1"+
"\15\3\14\0\35\2\3\0\4\3\57\1\16\3\16\0\1\1\12\4"+
"\46\0\51\1\16\3\11\0\3\1\1\3\10\1\2\3\2\0\12\4"+
"\6\0\33\20\1\21\4\0\60\20\1\21\1\20\3\21\2\20\2\21"+
"\5\20\2\21\1\20\1\21\1\20\30\0\5\20\13\1\5\3\2\0"+
"\3\1\2\3\12\0\6\1\2\0\6\1\2\0\6\1\11\0\7\1"+
"\1\0\7\1\221\0\43\1\10\3\1\0\2\3\2\0\12\4\6\0"+
"\u2ba4\2\14\0\27\2\4\0\61\2\u2104\0\u016e\12\2\0\152\12\46\0"+
"\7\1\14\0\5\1\5\0\1\16\1\3\12\16\1\0\15\16\1\0"+
"\5\16\1\0\1\16\1\0\2\16\1\0\2\16\1\0\12\16\142\1"+
"\41\0\u016b\1\22\0\100\1\2\0\66\1\50\0\14\1\4\0\20\3"+
"\1\7\2\0\1\6\1\7\13\0\7\3\14\0\2\11\30\0\3\11"+
"\1\7\1\0\1\10\1\0\1\7\1\6\32\0\5\1\1\0\207\1"+
"\2\0\1\3\7\0\1\10\4\0\1\7\1\0\1\10\1\0\12\4"+
"\1\6\1\7\5\0\32\1\4\0\1\11\1\0\32\1\13\0\70\5"+
"\2\3\37\2\3\0\6\2\2\0\6\2\2\0\6\2\2\0\3\2"+
"\34\0\3\3\4\0\14\1\1\0\32\1\1\0\23\1\1\0\2\1"+
"\1\0\17\1\2\0\16\1\42\0\173\1\105\0\65\1\210\0\1\3"+
"\202\0\35\1\3\0\61\1\57\0\37\1\21\0\33\1\65\0\36\1"+
"\2\0\44\1\4\0\10\1\1\0\5\1\52\0\236\1\2\0\12\4"+
"\u0356\0\6\1\2\0\1\1\1\0\54\1\1\0\2\1\3\0\1\1"+
"\2\0\27\1\252\0\26\1\12\0\32\1\106\0\70\1\6\0\2\1"+
"\100\0\1\1\3\3\1\0\2\3\5\0\4\3\4\1\1\0\3\1"+
"\1\0\33\1\4\0\3\3\4\0\1\3\40\0\35\1\203\0\66\1"+
"\12\0\26\1\12\0\23\1\215\0\111\1\u03b7\0\3\3\65\1\17\3"+
"\37\0\12\4\20\0\3\3\55\1\13\3\2\0\1\3\22\0\31\1"+
"\7\0\12\4\6\0\3\3\44\1\16\3\1\0\12\4\100\0\3\3"+
"\60\1\16\3\4\1\13\0\12\4\u04a6\0\53\1\15\3\10\0\12\4"+
"\u0936\0\u036f\1\221\0\143\1\u0b9d\0\u042f\1\u33d1\0\u0239\1\u04c7\0\105\1"+
"\13\0\1\1\56\3\20\0\4\3\15\1\u4060\0\1\5\1\13\u2163\0"+
"\5\3\3\0\26\3\2\0\7\3\36\0\4\3\224\0\3\3\u01bb\0"+
"\125\1\1\0\107\1\1\0\2\1\2\0\1\1\2\0\2\1\2\0"+
"\4\1\1\0\14\1\1\0\1\1\1\0\7\1\1\0\101\1\1\0"+
"\4\1\2\0\10\1\1\0\7\1\1\0\34\1\1\0\4\1\1\0"+
"\5\1\1\0\1\1\3\0\7\1\1\0\u0154\1\2\0\31\1\1\0"+
"\31\1\1\0\37\1\1\0\31\1\1\0\37\1\1\0\31\1\1\0"+
"\37\1\1\0\31\1\1\0\37\1\1\0\31\1\1\0\10\1\2\0"+
"\62\4\u1600\0\4\1\1\0\33\1\1\0\2\1\1\0\1\1\2\0"+
"\1\1\1\0\12\1\1\0\4\1\1\0\1\1\1\0\1\1\6\0"+
"\1\1\4\0\1\1\1\0\1\1\1\0\1\1\1\0\3\1\1\0"+
"\2\1\1\0\1\1\2\0\1\1\1\0\1\1\1\0\1\1\1\0"+
"\1\1\1\0\1\1\1\0\2\1\1\0\1\1\2\0\4\1\1\0"+
"\7\1\1\0\4\1\1\0\4\1\1\0\1\1\1\0\12\1\1\0"+
"\21\1\5\0\3\1\1\0\5\1\1\0\21\1\u032a\0\32\17\1\13"+
"\u0dff\0\ua6d7\12\51\0\u1035\12\13\0\336\12\u3fe2\0\u021e\12\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\u05ee\0"+
"\1\3\36\0\140\3\200\0\360\3\uffff\0\uffff\0\ufe12\0";
"\42\0\1\32\1\7\3\0\1\31\2\0\1\7\1\0\1\24\1\0"+
"\1\25\1\0\12\21\1\23\1\24\5\0\32\15\4\0\1\26\1\0"+
"\32\15\56\0\1\4\1\15\2\0\1\5\1\4\6\0\1\15\1\0"+
"\1\23\2\0\1\15\5\0\27\15\1\0\37\15\1\0\u01ca\15\4\0"+
"\14\15\5\0\1\23\10\0\5\15\7\0\1\15\1\0\1\15\21\0"+
"\160\5\5\15\1\0\2\15\2\0\4\15\1\24\1\15\6\0\1\15"+
"\1\23\3\15\1\0\1\15\1\0\24\15\1\0\123\15\1\0\213\15"+
"\1\0\7\5\246\15\1\0\46\15\2\0\1\15\7\0\47\15\1\0"+
"\1\24\7\0\55\5\1\0\1\5\1\0\2\5\1\0\2\5\1\0"+
"\1\5\10\0\33\33\5\0\3\33\1\15\1\23\13\0\6\5\6\0"+
"\2\24\2\0\13\5\1\0\1\5\3\0\53\15\25\5\12\20\1\0"+
"\1\20\1\24\1\0\2\15\1\5\143\15\1\0\1\15\10\5\1\0"+
"\6\5\2\15\2\5\1\0\4\5\2\15\12\20\3\15\2\0\1\15"+
"\17\0\1\5\1\15\1\5\36\15\33\5\2\0\131\15\13\5\1\15"+
"\16\0\12\20\41\15\11\5\2\15\2\0\1\24\1\0\1\15\5\0"+
"\26\15\4\5\1\15\11\5\1\15\3\5\1\15\5\5\22\0\31\15"+
"\3\5\104\0\25\15\1\0\10\15\26\0\60\5\66\15\3\5\1\15"+
"\22\5\1\15\7\5\12\15\2\5\2\0\12\20\1\0\20\15\3\5"+
"\1\0\10\15\2\0\2\15\2\0\26\15\1\0\7\15\1\0\1\15"+
"\3\0\4\15\2\0\1\5\1\15\7\5\2\0\2\5\2\0\3\5"+
"\1\15\10\0\1\5\4\0\2\15\1\0\3\15\2\5\2\0\12\20"+
"\2\15\17\0\3\5\1\0\6\15\4\0\2\15\2\0\26\15\1\0"+
"\7\15\1\0\2\15\1\0\2\15\1\0\2\15\2\0\1\5\1\0"+
"\5\5\4\0\2\5\2\0\3\5\3\0\1\5\7\0\4\15\1\0"+
"\1\15\7\0\12\20\2\5\3\15\1\5\13\0\3\5\1\0\11\15"+
"\1\0\3\15\1\0\26\15\1\0\7\15\1\0\2\15\1\0\5\15"+
"\2\0\1\5\1\15\10\5\1\0\3\5\1\0\3\5\2\0\1\15"+
"\17\0\2\15\2\5\2\0\12\20\11\0\1\15\7\0\3\5\1\0"+
"\10\15\2\0\2\15\2\0\26\15\1\0\7\15\1\0\2\15\1\0"+
"\5\15\2\0\1\5\1\15\7\5\2\0\2\5\2\0\3\5\10\0"+
"\2\5\4\0\2\15\1\0\3\15\2\5\2\0\12\20\1\0\1\15"+
"\20\0\1\5\1\15\1\0\6\15\3\0\3\15\1\0\4\15\3\0"+
"\2\15\1\0\1\15\1\0\2\15\3\0\2\15\3\0\3\15\3\0"+
"\14\15\4\0\5\5\3\0\3\5\1\0\4\5\2\0\1\15\6\0"+
"\1\5\16\0\12\20\20\0\4\5\1\0\10\15\1\0\3\15\1\0"+
"\27\15\1\0\20\15\3\0\1\15\7\5\1\0\3\5\1\0\4\5"+
"\7\0\2\5\1\0\3\15\5\0\2\15\2\5\2\0\12\20\20\0"+
"\1\15\3\5\1\0\10\15\1\0\3\15\1\0\27\15\1\0\12\15"+
"\1\0\5\15\2\0\1\5\1\15\7\5\1\0\3\5\1\0\4\5"+
"\7\0\2\5\7\0\1\15\1\0\2\15\2\5\2\0\12\20\1\0"+
"\2\15\16\0\3\5\1\0\10\15\1\0\3\15\1\0\51\15\2\0"+
"\1\15\7\5\1\0\3\5\1\0\4\5\1\15\5\0\3\15\1\5"+
"\7\0\3\15\2\5\2\0\12\20\12\0\6\15\2\0\2\5\1\0"+
"\22\15\3\0\30\15\1\0\11\15\1\0\1\15\2\0\7\15\3\0"+
"\1\5\4\0\6\5\1\0\1\5\1\0\10\5\6\0\12\20\2\0"+
"\2\5\15\0\60\34\1\35\2\34\7\35\5\0\7\34\10\35\1\0"+
"\12\20\47\0\2\34\1\0\1\34\2\0\2\34\1\0\1\34\2\0"+
"\1\34\6\0\4\34\1\0\7\34\1\0\3\34\1\0\1\34\1\0"+
"\1\34\2\0\2\34\1\0\4\34\1\35\2\34\6\35\1\0\2\35"+
"\1\34\2\0\5\34\1\0\1\34\1\0\6\35\2\0\12\20\2\0"+
"\4\34\40\0\1\15\27\0\2\5\6\0\12\20\13\0\1\5\1\0"+
"\1\5\1\0\1\5\4\0\2\5\10\15\1\0\44\15\4\0\24\5"+
"\1\0\2\5\5\15\13\5\1\0\44\5\11\0\1\5\71\0\53\34"+
"\24\35\1\34\12\20\6\0\6\34\4\35\4\34\3\35\1\34\3\35"+
"\2\34\7\35\3\34\4\35\15\34\14\35\1\34\1\35\12\20\4\35"+
"\2\34\46\15\1\0\1\15\5\0\1\15\2\0\53\15\1\0\4\15"+
"\u0100\17\111\15\1\0\4\15\2\0\7\15\1\0\1\15\1\0\4\15"+
"\2\0\51\15\1\0\4\15\2\0\41\15\1\0\4\15\2\0\7\15"+
"\1\0\1\15\1\0\4\15\2\0\17\15\1\0\71\15\1\0\4\15"+
"\2\0\103\15\2\0\3\5\40\0\20\15\20\0\126\15\2\0\6\15"+
"\3\0\u026c\15\2\0\21\15\1\0\32\15\5\0\113\15\3\0\13\15"+
"\7\0\15\15\1\0\4\15\3\5\13\0\22\15\3\5\13\0\22\15"+
"\2\5\14\0\15\15\1\0\3\15\1\0\2\5\14\0\64\34\40\35"+
"\3\0\1\34\4\0\1\34\1\35\2\0\12\20\41\0\4\5\1\0"+
"\12\20\6\0\130\15\10\0\5\15\2\5\42\15\1\5\1\15\5\0"+
"\106\15\12\0\37\15\1\0\14\5\4\0\14\5\12\0\12\20\36\34"+
"\2\0\5\34\13\0\54\34\4\0\32\34\6\0\12\20\1\34\3\0"+
"\2\34\40\0\27\15\5\5\4\0\65\34\12\35\1\0\35\35\2\0"+
"\1\5\12\20\6\0\12\20\6\0\16\34\2\0\17\5\101\0\5\5"+
"\57\15\21\5\7\15\4\0\12\20\21\0\11\5\14\0\3\5\36\15"+
"\15\5\2\15\12\20\54\15\16\5\14\0\44\15\24\5\10\0\12\20"+
"\3\0\3\15\12\20\44\15\2\0\11\15\107\0\3\5\1\0\25\5"+
"\4\15\1\5\4\15\3\5\2\15\1\0\2\5\6\0\300\15\66\5"+
"\5\0\5\5\u0116\15\2\0\6\15\2\0\46\15\2\0\6\15\2\0"+
"\10\15\1\0\1\15\1\0\1\15\1\0\1\15\1\0\37\15\2\0"+
"\65\15\1\0\7\15\1\0\1\15\3\0\3\15\1\0\7\15\3\0"+
"\4\15\2\0\6\15\4\0\15\15\5\0\3\15\1\0\7\15\17\0"+
"\1\5\1\12\2\5\10\0\2\25\12\0\1\25\2\0\1\23\2\0"+
"\5\5\1\26\14\0\1\4\2\0\2\26\3\0\1\24\4\0\1\4"+
"\12\0\1\26\13\0\5\5\1\0\12\5\1\0\1\15\15\0\1\15"+
"\20\0\15\15\63\0\23\5\1\10\15\5\21\0\1\15\4\0\1\15"+
"\2\0\12\15\1\0\1\15\3\0\5\15\4\0\1\4\1\0\1\15"+
"\1\0\1\15\1\0\1\15\1\0\4\15\1\0\12\15\1\16\2\0"+
"\4\15\5\0\5\15\4\0\1\15\21\0\51\15\13\0\6\4\17\0"+
"\2\4\u016f\0\2\4\14\0\1\4\137\0\1\4\106\0\1\4\31\0"+
"\13\4\4\0\3\4\273\0\14\15\1\16\47\15\300\0\2\4\12\0"+
"\1\4\11\0\1\4\72\0\4\4\1\0\5\4\1\4\1\0\7\4"+
"\1\4\2\4\1\4\1\4\1\0\2\4\2\4\1\4\4\4\1\3"+
"\2\4\1\4\1\4\2\4\2\4\1\4\3\4\1\4\3\4\2\4"+
"\10\4\3\4\5\4\1\4\1\4\1\4\5\4\14\4\13\4\2\4"+
"\2\4\1\4\1\4\2\4\1\4\1\4\22\4\1\4\2\4\2\4"+
"\6\4\12\0\2\4\6\4\1\4\1\4\1\4\2\4\3\4\2\4"+
"\10\4\2\4\4\4\2\4\13\4\2\4\5\4\2\4\2\4\1\4"+
"\5\4\2\4\1\4\1\4\1\4\2\4\24\4\2\4\5\4\6\4"+
"\1\4\2\4\1\3\1\4\2\4\1\4\4\4\1\4\2\4\1\4"+
"\2\0\2\4\4\3\1\4\1\4\2\4\1\4\1\0\1\4\1\0"+
"\1\4\6\0\1\4\3\0\1\4\6\0\1\4\12\0\2\4\17\0"+
"\1\4\2\0\1\4\4\0\1\4\1\0\1\4\4\0\3\4\1\0"+
"\1\4\13\0\2\4\3\4\55\0\3\4\11\0\1\4\16\0\1\4"+
"\16\0\1\4\u0174\0\2\4\u01cf\0\3\4\23\0\2\4\63\0\1\4"+
"\4\0\1\4\252\0\57\15\1\0\57\15\1\0\205\15\6\0\4\15"+
"\3\5\2\15\14\0\46\15\1\0\1\15\5\0\1\15\2\0\70\15"+
"\7\0\1\15\17\0\1\5\27\15\11\0\7\15\1\0\7\15\1\0"+
"\7\15\1\0\7\15\1\0\7\15\1\0\7\15\1\0\7\15\1\0"+
"\7\15\1\0\40\5\57\0\1\15\120\0\32\27\1\0\131\27\14\0"+
"\326\27\57\0\1\15\1\0\1\27\31\0\11\27\6\5\1\4\5\22"+
"\2\0\3\27\1\15\1\15\1\4\3\0\126\30\2\0\2\5\2\22"+
"\3\30\133\22\1\0\4\22\5\0\51\15\3\0\136\17\21\0\33\15"+
"\65\0\20\22\227\0\1\4\1\0\1\4\66\0\57\22\1\0\130\22"+
"\250\0\u19b6\27\112\0\u51d6\27\52\0\u048d\15\103\0\56\15\2\0\u010d\15"+
"\3\0\20\15\12\20\2\15\24\0\57\15\4\5\1\0\12\5\1\0"+
"\37\15\2\5\120\15\2\5\45\0\11\15\2\0\147\15\2\0\44\15"+
"\1\0\10\15\77\0\13\15\1\5\3\15\1\5\4\15\1\5\27\15"+
"\5\5\30\0\64\15\14\0\2\5\62\15\22\5\12\0\12\20\6\0"+
"\22\5\6\15\3\0\1\15\1\0\1\15\2\0\12\20\34\15\10\5"+
"\2\0\27\15\15\5\14\0\35\17\3\0\4\5\57\15\16\5\16\0"+
"\1\15\12\20\6\0\5\34\1\35\12\34\12\20\5\34\1\0\51\15"+
"\16\5\11\0\3\15\1\5\10\15\2\5\2\0\12\20\6\0\33\34"+
"\3\35\62\34\1\35\1\34\3\35\2\34\2\35\5\34\2\35\1\34"+
"\1\35\1\34\30\0\5\34\13\15\5\5\2\0\3\15\2\5\12\0"+
"\6\15\2\0\6\15\2\0\6\15\11\0\7\15\1\0\7\15\1\0"+
"\53\15\1\0\12\15\12\0\163\15\10\5\1\0\2\5\2\0\12\20"+
"\6\0\u2ba4\17\14\0\27\17\4\0\61\17\u2104\0\u016e\27\2\0\152\27"+
"\46\0\7\15\14\0\5\15\5\0\1\33\1\5\12\33\1\0\15\33"+
"\1\0\5\33\1\0\1\33\1\0\2\33\1\0\2\33\1\0\12\33"+
"\142\15\41\0\u016b\15\22\0\100\15\2\0\66\15\50\0\14\15\4\0"+
"\16\5\1\6\1\11\1\24\2\0\1\23\1\24\13\0\20\5\3\0"+
"\2\26\30\0\3\26\1\24\1\0\1\25\1\0\1\24\1\23\32\0"+
"\5\15\1\0\207\15\2\0\1\5\7\0\1\25\4\0\1\24\1\0"+
"\1\25\1\0\12\20\1\23\1\24\5\0\32\15\4\0\1\26\1\0"+
"\32\15\13\0\70\22\2\5\37\17\3\0\6\17\2\0\6\17\2\0"+
"\6\17\2\0\3\17\34\0\3\5\4\0\14\15\1\0\32\15\1\0"+
"\23\15\1\0\2\15\1\0\17\15\2\0\16\15\42\0\173\15\105\0"+
"\65\15\210\0\1\5\202\0\35\15\3\0\61\15\17\0\1\5\37\0"+
"\40\15\20\0\33\15\5\0\46\15\5\5\5\0\36\15\2\0\44\15"+
"\4\0\10\15\1\0\5\15\52\0\236\15\2\0\12\20\6\0\44\15"+
"\4\0\44\15\4\0\50\15\10\0\64\15\234\0\u0137\15\11\0\26\15"+
"\12\0\10\15\230\0\6\15\2\0\1\15\1\0\54\15\1\0\2\15"+
"\3\0\1\15\2\0\27\15\12\0\27\15\11\0\37\15\101\0\23\15"+
"\1\0\2\15\12\0\26\15\12\0\32\15\106\0\70\15\6\0\2\15"+
"\100\0\1\15\3\5\1\0\2\5\5\0\4\5\4\15\1\0\3\15"+
"\1\0\33\15\4\0\3\5\4\0\1\5\40\0\35\15\3\0\35\15"+
"\43\0\10\15\1\0\34\15\2\5\31\0\66\15\12\0\26\15\12\0"+
"\23\15\15\0\22\15\156\0\111\15\67\0\63\15\15\0\63\15\u030d\0"+
"\3\5\65\15\17\5\37\0\12\20\17\0\4\5\55\15\13\5\2\0"+
"\1\5\22\0\31\15\7\0\12\20\6\0\3\5\44\15\16\5\1\0"+
"\12\20\20\0\43\15\1\5\2\0\1\15\11\0\3\5\60\15\16\5"+
"\4\15\5\0\3\5\3\0\12\20\1\15\1\0\1\15\43\0\22\15"+
"\1\0\31\15\14\5\6\0\1\5\101\0\7\15\1\0\1\15\1\0"+
"\4\15\1\0\17\15\1\0\12\15\7\0\57\15\14\5\5\0\12\20"+
"\6\0\4\5\1\0\10\15\2\0\2\15\2\0\26\15\1\0\7\15"+
"\1\0\2\15\1\0\5\15\2\0\1\5\1\15\7\5\2\0\2\5"+
"\2\0\3\5\2\0\1\15\6\0\1\5\5\0\5\15\2\5\2\0"+
"\7\5\3\0\5\5\213\0\65\15\22\5\4\15\5\0\12\20\46\0"+
"\60\15\24\5\2\15\1\0\1\15\10\0\12\20\246\0\57\15\7\5"+
"\2\0\11\5\27\0\4\15\2\5\42\0\60\15\21\5\3\0\1\15"+
"\13\0\12\20\46\0\53\15\15\5\10\0\12\20\66\0\32\34\3\0"+
"\17\35\4\0\12\20\2\34\3\0\1\34\u0160\0\100\15\12\20\25\0"+
"\1\15\u01c0\0\71\15\u0107\0\11\15\1\0\45\15\10\5\1\0\10\5"+
"\1\15\17\0\12\20\30\0\36\15\2\0\26\5\1\0\16\5\u0349\0"+
"\u039a\15\146\0\157\15\21\0\304\15\u0abc\0\u042f\15\u0fd1\0\u0247\15\u21b9\0"+
"\u0239\15\7\0\37\15\1\0\12\20\146\0\36\15\2\0\5\5\13\0"+
"\60\15\7\5\11\0\4\15\14\0\12\20\11\0\25\15\5\0\23\15"+
"\u0370\0\105\15\13\0\1\15\56\5\20\0\4\5\15\15\100\0\1\15"+
"\u401f\0\1\22\1\30\u0bfe\0\153\15\5\0\15\15\3\0\11\15\7\0"+
"\12\15\3\0\2\5\1\0\4\5\u14c1\0\5\5\3\0\26\5\2\0"+
"\7\5\36\0\4\5\224\0\3\5\u01bb\0\125\15\1\0\107\15\1\0"+
"\2\15\2\0\1\15\2\0\2\15\2\0\4\15\1\0\14\15\1\0"+
"\1\15\1\0\7\15\1\0\101\15\1\0\4\15\2\0\10\15\1\0"+
"\7\15\1\0\34\15\1\0\4\15\1\0\5\15\1\0\1\15\3\0"+
"\7\15\1\0\u0154\15\2\0\31\15\1\0\31\15\1\0\37\15\1\0"+
"\31\15\1\0\37\15\1\0\31\15\1\0\37\15\1\0\31\15\1\0"+
"\37\15\1\0\31\15\1\0\10\15\2\0\62\20\u0200\0\67\5\4\0"+
"\62\5\10\0\1\5\16\0\1\5\26\0\5\5\1\0\17\5\u0550\0"+
"\7\5\1\0\21\5\2\0\7\5\1\0\2\5\1\0\5\5\u07d5\0"+
"\305\15\13\0\7\5\51\0\104\15\7\5\5\0\12\20\u04a6\0\4\15"+
"\1\0\33\15\1\0\2\15\1\0\1\15\2\0\1\15\1\0\12\15"+
"\1\0\4\15\1\0\1\15\1\0\1\15\6\0\1\15\4\0\1\15"+
"\1\0\1\15\1\0\1\15\1\0\3\15\1\0\2\15\1\0\1\15"+
"\2\0\1\15\1\0\1\15\1\0\1\15\1\0\1\15\1\0\1\15"+
"\1\0\2\15\1\0\1\15\2\0\4\15\1\0\7\15\1\0\4\15"+
"\1\0\4\15\1\0\1\15\1\0\12\15\1\0\21\15\5\0\3\15"+
"\1\0\5\15\1\0\21\15\u0144\0\4\4\1\4\312\4\1\4\60\4"+
"\15\0\3\4\37\0\1\4\32\15\6\0\32\15\2\0\4\4\2\16"+
"\14\15\2\16\12\15\4\0\1\4\2\0\12\4\22\0\71\4\32\1"+
"\1\30\2\4\15\4\12\0\1\4\24\0\1\4\2\0\11\4\1\0"+
"\4\4\11\0\7\4\2\4\256\4\42\4\2\4\141\4\1\3\16\4"+
"\2\4\2\4\1\4\3\4\2\4\44\4\3\3\2\4\1\3\2\4"+
"\3\3\44\4\2\4\3\4\1\4\4\4\5\2\102\4\2\3\2\4"+
"\13\3\25\4\4\3\4\4\1\3\1\4\11\3\3\4\1\3\4\4"+
"\3\3\1\4\3\3\42\4\1\3\123\4\1\4\77\4\10\0\3\4"+
"\6\4\1\4\30\4\7\4\2\4\2\4\1\4\2\3\4\4\1\3"+
"\14\4\1\4\2\4\4\4\2\4\1\3\4\4\2\3\15\4\2\4"+
"\2\4\1\4\10\4\2\4\11\4\1\4\5\4\3\4\14\4\3\4"+
"\10\4\3\4\2\4\1\4\1\4\1\4\4\4\1\4\6\4\1\4"+
"\3\4\1\4\6\4\113\4\3\3\3\4\5\3\60\0\43\4\1\3"+
"\20\4\3\3\11\4\1\3\5\4\5\4\1\4\1\3\6\4\15\4"+
"\6\4\3\4\1\4\1\4\2\4\3\4\1\4\2\4\7\4\6\4"+
"\164\0\14\4\125\0\53\4\14\0\4\4\70\0\10\4\12\0\6\4"+
"\50\0\10\4\36\0\122\4\14\0\4\4\10\4\5\3\1\4\2\3"+
"\6\4\1\3\11\4\12\3\1\4\1\0\1\4\2\3\1\4\6\4"+
"\1\0\52\4\2\4\4\4\3\4\1\4\1\4\47\4\15\4\5\4"+
"\2\3\1\4\2\3\6\4\3\4\15\4\1\4\15\3\42\4\u05fe\4"+
"\2\0\ua6d7\27\51\0\u1035\27\13\0\336\27\2\0\u1682\27\u295e\0\u021e\27"+
"\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\u05ee\0"+
"\1\5\36\0\137\13\1\14\200\0\360\5\uffff\0\uffff\0\ufe12\0";
/**
* Translates characters to character classes
@ -218,12 +284,15 @@ public final class StandardTokenizerImpl {
private static final int [] ZZ_ACTION = zzUnpackAction();
private static final String ZZ_ACTION_PACKED_0 =
"\1\0\1\1\1\2\1\3\1\4\1\5\1\1\1\6"+
"\1\7\1\2\1\1\1\10\1\2\1\0\1\2\1\0"+
"\1\4\1\0\2\2\2\0\1\1\1\0";
"\1\0\2\1\3\2\2\1\1\3\1\2\1\4\2\5"+
"\1\6\1\1\1\7\1\10\1\3\1\11\1\2\1\0"+
"\4\2\1\0\1\2\2\0\1\3\1\0\1\3\2\2"+
"\1\0\1\5\1\2\1\5\1\0\2\3\1\0\2\2"+
"\2\0\1\2\1\0\2\3\5\2\1\0\1\2\1\3"+
"\3\2";
private static int [] zzUnpackAction() {
int [] result = new int[24];
int [] result = new int[61];
int offset = 0;
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
return result;
@ -248,12 +317,17 @@ public final class StandardTokenizerImpl {
private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
private static final String ZZ_ROWMAP_PACKED_0 =
"\0\0\0\22\0\44\0\66\0\110\0\132\0\154\0\176"+
"\0\220\0\242\0\264\0\306\0\330\0\352\0\374\0\u010e"+
"\0\u0120\0\154\0\u0132\0\u0144\0\u0156\0\264\0\u0168\0\u017a";
"\0\0\0\36\0\74\0\132\0\170\0\226\0\264\0\322"+
"\0\360\0\u010e\0\u012c\0\u014a\0\u0168\0\u0186\0\u01a4\0\u01c2"+
"\0\u01e0\0\u01fe\0\u021c\0\u023a\0\74\0\u0258\0\u0276\0\u0294"+
"\0\u02b2\0\264\0\u02d0\0\u02ee\0\322\0\u030c\0\u032a\0\u0348"+
"\0\u0366\0\u0384\0\u03a2\0\u03c0\0\u03de\0\u03fc\0\u01a4\0\u041a"+
"\0\u0438\0\u0456\0\u0474\0\u0492\0\u04b0\0\u04ce\0\u04ec\0\u050a"+
"\0\u0528\0\u0546\0\u0564\0\u0582\0\u05a0\0\u05be\0\u05dc\0\u05fa"+
"\0\36\0\u0618\0\360\0\u0636\0\u0654";
private static int [] zzUnpackRowMap() {
int [] result = new int[24];
int [] result = new int[61];
int offset = 0;
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
return result;
@ -276,33 +350,94 @@ public final class StandardTokenizerImpl {
private static final int [] ZZ_TRANS = zzUnpackTrans();
private static final String ZZ_TRANS_PACKED_0 =
"\1\2\1\3\1\4\1\2\1\5\1\6\3\2\1\7"+
"\1\10\1\11\2\2\1\12\1\13\2\14\23\0\3\3"+
"\1\15\1\0\1\16\1\0\1\16\1\17\2\0\1\16"+
"\1\0\1\12\2\0\1\3\1\0\1\3\2\4\1\15"+
"\1\0\1\16\1\0\1\16\1\17\2\0\1\16\1\0"+
"\1\12\2\0\1\4\1\0\2\3\2\5\2\0\2\20"+
"\1\21\2\0\1\20\1\0\1\12\2\0\1\5\3\0"+
"\1\6\1\0\1\6\3\0\1\17\7\0\1\6\1\0"+
"\2\3\1\22\1\5\1\23\3\0\1\22\4\0\1\12"+
"\2\0\1\22\3\0\1\10\15\0\1\10\3\0\1\11"+
"\15\0\1\11\1\0\2\3\1\12\1\15\1\0\1\16"+
"\1\0\1\16\1\17\2\0\1\24\1\25\1\12\2\0"+
"\1\12\3\0\1\26\13\0\1\27\1\0\1\26\3\0"+
"\1\14\14\0\2\14\1\0\2\3\2\15\2\0\2\30"+
"\1\17\2\0\1\30\1\0\1\12\2\0\1\15\1\0"+
"\2\3\1\16\12\0\1\3\2\0\1\16\1\0\2\3"+
"\1\17\1\15\1\23\3\0\1\17\4\0\1\12\2\0"+
"\1\17\3\0\1\20\1\5\14\0\1\20\1\0\2\3"+
"\1\21\1\5\1\23\3\0\1\21\4\0\1\12\2\0"+
"\1\21\3\0\1\23\1\0\1\23\3\0\1\17\7\0"+
"\1\23\1\0\2\3\1\24\1\15\4\0\1\17\4\0"+
"\1\12\2\0\1\24\3\0\1\25\12\0\1\24\2\0"+
"\1\25\3\0\1\27\13\0\1\27\1\0\1\27\3\0"+
"\1\30\1\15\14\0\1\30";
"\1\2\1\3\1\4\1\5\1\6\2\2\1\7\2\2"+
"\1\10\2\2\1\11\1\12\1\13\1\14\1\15\1\16"+
"\3\2\1\17\1\20\1\21\2\2\1\22\2\23\37\0"+
"\1\24\3\0\2\25\1\0\5\25\20\0\1\25\5\0"+
"\1\4\2\0\1\4\1\0\1\26\2\4\20\0\1\4"+
"\2\0\1\4\2\0\1\5\2\0\1\5\1\27\1\30"+
"\2\5\20\0\1\5\5\0\1\6\2\0\1\6\1\27"+
"\1\31\2\6\20\0\1\6\5\0\1\32\2\0\1\33"+
"\1\34\3\32\20\0\1\32\3\0\1\5\1\6\5\0"+
"\1\35\3\0\1\6\24\0\2\11\1\0\10\11\2\36"+
"\1\0\1\37\1\0\1\37\1\40\2\0\1\37\1\0"+
"\1\22\1\0\1\11\5\0\1\12\1\11\1\0\1\12"+
"\1\41\1\42\2\12\3\11\2\36\1\0\1\37\1\0"+
"\1\37\1\40\2\0\1\37\1\0\1\22\1\0\1\12"+
"\5\0\2\13\1\0\5\13\2\11\1\13\2\36\1\0"+
"\1\37\1\0\1\37\1\40\2\0\1\37\1\0\1\22"+
"\1\0\1\13\5\0\2\14\1\0\5\14\3\11\2\14"+
"\2\0\2\43\1\44\2\0\1\43\1\0\1\22\1\0"+
"\1\14\5\0\1\15\1\14\1\0\1\45\1\46\3\15"+
"\3\11\2\14\2\0\2\43\1\44\2\0\1\43\1\0"+
"\1\22\1\0\1\15\5\0\2\16\1\0\5\16\5\0"+
"\1\16\3\0\1\40\6\0\1\16\5\0\2\47\1\0"+
"\5\47\3\11\2\14\1\50\3\0\1\47\4\0\1\22"+
"\1\0\1\47\5\0\2\20\1\0\5\20\20\0\1\20"+
"\5\0\2\21\1\0\5\21\20\0\1\21\5\0\2\22"+
"\1\0\5\22\3\11\2\36\1\0\1\37\1\0\1\37"+
"\1\40\2\0\1\51\1\52\1\22\1\0\1\22\5\0"+
"\2\23\1\0\5\23\17\0\2\23\5\0\2\24\1\0"+
"\5\24\20\0\1\24\2\0\1\4\1\53\1\54\1\4"+
"\2\0\1\4\1\0\1\26\2\4\1\0\1\54\16\0"+
"\1\4\12\0\1\55\1\56\24\0\1\4\1\53\1\54"+
"\1\5\2\0\1\5\1\27\1\30\2\5\1\0\1\54"+
"\16\0\1\5\2\0\1\4\1\53\1\54\1\6\2\0"+
"\1\6\1\27\1\31\2\6\1\0\1\54\16\0\1\6"+
"\5\0\1\33\2\0\1\33\1\34\3\33\20\0\1\33"+
"\10\0\1\57\32\0\2\36\1\0\5\36\3\11\2\36"+
"\2\0\2\60\1\40\2\0\1\60\1\0\1\22\1\0"+
"\1\36\5\0\2\37\1\0\5\37\3\11\13\0\1\11"+
"\1\0\1\37\5\0\2\40\1\0\5\40\3\11\2\36"+
"\1\50\3\0\1\40\4\0\1\22\1\0\1\40\5\0"+
"\2\11\1\0\2\11\1\61\1\62\4\11\2\36\1\0"+
"\1\37\1\0\1\37\1\40\2\0\1\37\1\0\1\22"+
"\1\0\1\11\2\0\1\4\1\53\1\54\1\12\1\11"+
"\1\0\1\12\1\41\1\42\2\12\1\11\1\63\1\11"+
"\2\36\1\0\1\37\1\0\1\37\1\40\2\0\1\37"+
"\1\0\1\22\1\0\1\12\5\0\2\43\1\0\5\43"+
"\3\0\2\14\13\0\1\43\5\0\2\44\1\0\5\44"+
"\3\11\2\14\1\50\3\0\1\44\4\0\1\22\1\0"+
"\1\44\5\0\1\45\1\14\1\0\1\45\1\46\3\45"+
"\3\11\2\14\2\0\2\43\1\44\2\0\1\43\1\0"+
"\1\22\1\0\1\45\5\0\2\14\1\0\1\64\4\14"+
"\3\11\2\14\2\0\2\43\1\44\2\0\1\43\1\0"+
"\1\22\1\0\1\14\5\0\2\50\1\0\5\50\5\0"+
"\1\50\3\0\1\40\6\0\1\50\5\0\2\51\1\0"+
"\5\51\3\11\2\36\4\0\1\40\4\0\1\22\1\0"+
"\1\51\5\0\2\52\1\0\5\52\16\0\1\51\1\0"+
"\1\52\2\0\1\4\2\0\1\53\2\0\1\53\1\65"+
"\1\66\2\53\20\0\1\53\5\0\1\54\2\0\1\54"+
"\1\65\1\67\2\54\20\0\1\54\2\0\1\4\1\53"+
"\1\54\5\0\1\70\3\0\1\54\32\0\1\56\1\71"+
"\26\0\1\57\2\0\1\57\1\0\3\57\20\0\1\57"+
"\5\0\2\60\1\0\5\60\3\0\2\36\13\0\1\60"+
"\2\0\1\4\1\53\1\54\2\11\1\0\2\11\1\72"+
"\3\11\1\63\1\11\2\36\1\0\1\37\1\0\1\37"+
"\1\40\2\0\1\37\1\0\1\22\1\0\1\11\5\0"+
"\2\11\1\0\3\11\1\62\1\73\3\11\2\36\1\0"+
"\1\37\1\0\1\37\1\40\2\0\1\37\1\0\1\22"+
"\1\0\1\11\5\0\1\63\1\11\1\0\1\63\1\74"+
"\1\75\2\63\3\11\2\36\1\0\1\37\1\0\1\37"+
"\1\40\2\0\1\37\1\0\1\22\1\0\1\63\5\0"+
"\1\64\1\14\1\0\1\64\1\14\3\64\3\11\2\14"+
"\2\0\2\43\1\44\2\0\1\43\1\0\1\22\1\0"+
"\1\64\12\0\1\55\25\0\1\4\1\53\1\54\1\53"+
"\2\0\1\53\1\65\1\66\2\53\1\0\1\54\16\0"+
"\1\53\2\0\1\4\1\53\2\54\2\0\1\54\1\65"+
"\1\67\2\54\1\0\1\54\16\0\1\54\3\0\1\53"+
"\1\54\5\0\1\70\3\0\1\54\22\0\1\53\1\54"+
"\2\11\1\0\2\11\1\72\3\11\1\63\1\11\2\36"+
"\1\0\1\37\1\0\1\37\1\40\2\0\1\37\1\0"+
"\1\22\1\0\1\11\5\0\2\11\1\0\2\11\1\61"+
"\5\11\2\36\1\0\1\37\1\0\1\37\1\40\2\0"+
"\1\37\1\0\1\22\1\0\1\11\2\0\1\4\1\53"+
"\1\54\1\63\1\11\1\0\1\63\1\74\1\75\2\63"+
"\1\11\1\63\1\11\2\36\1\0\1\37\1\0\1\37"+
"\1\40\2\0\1\37\1\0\1\22\1\0\1\63";
private static int [] zzUnpackTrans() {
int [] result = new int[396];
int [] result = new int[1650];
int offset = 0;
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
return result;
@ -329,7 +464,7 @@ public final class StandardTokenizerImpl {
/* error messages for the codes above */
private static final String ZZ_ERROR_MSG[] = {
"Unkown internal scanner error",
"Unknown internal scanner error",
"Error: could not match input",
"Error: pushback value was too large"
};
@ -340,11 +475,12 @@ public final class StandardTokenizerImpl {
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
private static final String ZZ_ATTRIBUTE_PACKED_0 =
"\1\0\1\11\13\1\1\0\1\1\1\0\1\1\1\0"+
"\2\1\2\0\1\1\1\0";
"\1\0\1\11\22\1\1\0\4\1\1\0\1\1\2\0"+
"\1\1\1\0\3\1\1\0\3\1\1\0\2\1\1\0"+
"\2\1\2\0\1\1\1\0\7\1\1\0\1\11\4\1";
private static int [] zzUnpackAttribute() {
int [] result = new int[24];
int [] result = new int[61];
int offset = 0;
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
return result;
@ -401,11 +537,11 @@ public final class StandardTokenizerImpl {
private int yycolumn;
/**
* zzAtBOL == true <=> the scanner is currently at the beginning of a line
* zzAtBOL == true iff the scanner is currently at the beginning of a line
*/
private boolean zzAtBOL = true;
/** zzAtEOF == true <=> the scanner is at the EOF */
/** zzAtEOF == true iff the scanner is at the EOF */
private boolean zzAtEOF;
/** denotes if the user-EOF-code has already been executed */
@ -447,6 +583,9 @@ public final class StandardTokenizerImpl {
/** Hangul token type */
public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
/** Emoji token type */
public static final int EMOJI_TYPE = StandardTokenizer.EMOJI;
/** Character count processed so far */
public final int yychar()
@ -492,7 +631,7 @@ public final class StandardTokenizerImpl {
char [] map = new char[0x110000];
int i = 0; /* index in packed string */
int j = 0; /* index in unpacked array */
while (i < 2836) {
while (i < 4122) {
int count = packed.charAt(i++);
char value = packed.charAt(i++);
do map[j++] = value; while (--count > 0);
@ -500,6 +639,8 @@ public final class StandardTokenizerImpl {
return map;
}
/* -------------------------------------------------------------------------------- */
/* Begin Lucene-specific disable-buffer-expansion modifications to skeleton.default */
/**
* Refills the input buffer.
@ -527,32 +668,45 @@ public final class StandardTokenizerImpl {
/* fill the buffer with new input */
int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;
int totalRead = 0;
while (totalRead < requested) {
int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
if (numRead == -1) {
break;
}
totalRead += numRead;
int requested = zzBuffer.length - zzEndRead - zzFinalHighSurrogate;
if (requested == 0) {
return true;
}
int numRead = zzReader.read(zzBuffer, zzEndRead, requested);
if (totalRead > 0) {
zzEndRead += totalRead;
if (totalRead == requested) { /* possibly more input available */
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
/* not supposed to occur according to specification of java.io.Reader */
if (numRead == 0) {
throw new java.io.IOException("Reader returned 0 characters. See JFlex examples for workaround.");
}
if (numRead > 0) {
zzEndRead += numRead;
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
if (numRead == requested) { // We might have requested too few chars to encode a full Unicode character.
--zzEndRead;
zzFinalHighSurrogate = 1;
if (totalRead == 1) { return true; }
if (numRead == 1) {
return true;
}
} else { // There is room in the buffer for at least one more char
int c = zzReader.read(); // Expecting to read a low surrogate char
if (c == -1) {
return true;
} else {
zzBuffer[zzEndRead++] = (char)c;
return false;
}
}
}
/* potentially more input available */
return false;
}
// totalRead = 0: End of stream
/* numRead < 0 ==> end of stream */
return true;
}
/* End Lucene-specific disable-buffer-expansion modifications to skeleton.default */
/* ------------------------------------------------------------------------------ */
/**
* Closes the input stream.
@ -773,49 +927,62 @@ public final class StandardTokenizerImpl {
// store back cached position
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 1:
{ /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
}
case 9: break;
case 2:
{ return WORD_TYPE;
}
case 10: break;
case 3:
{ return HANGUL_TYPE;
}
case 11: break;
case 4:
{ return NUMERIC_TYPE;
}
case 12: break;
case 5:
{ return KATAKANA_TYPE;
}
case 13: break;
case 6:
{ return IDEOGRAPHIC_TYPE;
}
case 14: break;
case 7:
{ return HIRAGANA_TYPE;
}
case 15: break;
case 8:
{ return SOUTH_EAST_ASIAN_TYPE;
}
case 16: break;
default:
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
{
return YYEOF;
}
}
else {
}
else {
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 1:
{ /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, emoji or SE Asian -- ignore it. */
}
// fall through
case 10: break;
case 2:
{ return EMOJI_TYPE;
}
// fall through
case 11: break;
case 3:
{ return WORD_TYPE;
}
// fall through
case 12: break;
case 4:
{ return HANGUL_TYPE;
}
// fall through
case 13: break;
case 5:
{ return NUMERIC_TYPE;
}
// fall through
case 14: break;
case 6:
{ return KATAKANA_TYPE;
}
// fall through
case 15: break;
case 7:
{ return IDEOGRAPHIC_TYPE;
}
// fall through
case 16: break;
case 8:
{ return HIRAGANA_TYPE;
}
// fall through
case 17: break;
case 9:
{ return SOUTH_EAST_ASIAN_TYPE;
}
// fall through
case 18: break;
default:
zzScanError(ZZ_NO_MATCH);
}
}
}
}
}

View File

@ -34,12 +34,13 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
* <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
* <li>&lt;KATAKANA&gt;: A sequence of katakana characters</li>
* <li>&lt;HANGUL&gt;: A sequence of Hangul characters</li>
* <li>&lt;EMOJI&gt;: A sequence of Emoji characters</li>
* </ul>
*/
@SuppressWarnings("fallthrough")
%%
%unicode 6.3
%unicode 9.0
%integer
%final
%public
@ -48,22 +49,67 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%char
%buffer 255
// UAX#29 WB4. X (Extend | Format)* --> X
//////////////////////////////////////////////////////////////////////////
// Begin Emoji Macros - see documentation below, near the EMOJI_TYPE rule
// TODO: Remove this include file when JFlex supports these properties directly (in Unicode 11.0+)
%include ../../../../../../data/jflex/UnicodeEmojiProperties.jflex
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
//
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] [\p{WB:Format}\p{WB:Extend}]*
HebrewOrALetterEx = [\p{WB:HebrewLetter}\p{WB:ALetter}] [\p{WB:Format}\p{WB:Extend}]*
NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] [\p{WB:Format}\p{WB:Extend}]*
KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
HanEx = \p{Script:Han} [\p{WB:Format}\p{WB:Extend}]*
HiraganaEx = \p{Script:Hiragana} [\p{WB:Format}\p{WB:Extend}]*
SingleQuoteEx = \p{WB:Single_Quote} [\p{WB:Format}\p{WB:Extend}]*
DoubleQuoteEx = \p{WB:Double_Quote} [\p{WB:Format}\p{WB:Extend}]*
HebrewLetterEx = \p{WB:Hebrew_Letter} [\p{WB:Format}\p{WB:Extend}]*
RegionalIndicatorEx = \p{WB:RegionalIndicator} [\p{WB:Format}\p{WB:Extend}]*
ComplexContextEx = \p{LB:Complex_Context} [\p{WB:Format}\p{WB:Extend}]*
// \uFE0E (Text Presentation Selector) and \uFE0F (Emoji Presentation Selector) - included in \p{WB:Extend}
// - are explicitly excluded here so that we can properly handle Emoji sequences.
//
ExtFmtZwjSansPresSel = [[\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]--[\uFE0E\uFE0F]]*
KeyCapBaseChar = [0-9#*]
KeyCapBaseCharEx = {KeyCapBaseChar} {ExtFmtZwjSansPresSel}
KeyCap = \u20E3
KeyCapEx = {KeyCap} {ExtFmtZwjSansPresSel}
// # \u3030 = WAVY DASH; \u303D = PART ALTERNATION MARK
AccidentalEmoji = [©®™\u3030\u303D]
EmojiRKAM = ( \p{WB:Regional_Indicator} | {KeyCapBaseChar} | {AccidentalEmoji} | {Emoji_Modifier} )
// Unlike Unicode properties, macros are not allowed in character classes, so we achieve set difference
// by applying DeMorgan: the expression that matches everything of 'a' not matched by 'b' is: !(!a|b)
// TODO: Convert this expression to character class difference when JFlex supports the properties directly (in Unicode 11.0+)
EmojiSansRKAM = !( ! {Emoji} | {EmojiRKAM} )
EmojiChar = ( {Extended_Pictographic} | {EmojiSansRKAM} )
EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel}
EmojiModifierBaseEx = {Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
EmojiModifierEx = {Emoji_Modifier} {ExtFmtZwjSansPresSel}
EmojiPresentationSelector = \uFE0F
EmojiCharOrPresSeqOrModSeq = ( \p{WB:ZWJ}* {EmojiCharEx} {EmojiPresentationSelector}? ) | ( ( \p{WB:ZWJ}* {EmojiModifierBaseEx} )? {EmojiModifierEx} )
TagSpec = [\u{E0020}-\u{E007E}]
TagTerm = \u{E007F}
// End Emoji Macros
//////////////////////////////////////////////////////////////////////////
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
//
ExtFmtZwj = [\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]*
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] {ExtFmtZwj}
AHLetterEx = [\p{WB:ALetter}\p{WB:Hebrew_Letter}] {ExtFmtZwj}
NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] {ExtFmtZwj}
KatakanaEx = \p{WB:Katakana} {ExtFmtZwj}
MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] {ExtFmtZwj}
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] {ExtFmtZwj}
ExtendNumLetEx = \p{WB:ExtendNumLet} {ExtFmtZwj}
HanEx = \p{Script:Han} {ExtFmtZwj}
HiraganaEx = \p{Script:Hiragana} {ExtFmtZwj}
SingleQuoteEx = \p{WB:Single_Quote} {ExtFmtZwj}
DoubleQuoteEx = \p{WB:Double_Quote} {ExtFmtZwj}
HebrewLetterEx = \p{WB:Hebrew_Letter} {ExtFmtZwj}
RegionalIndicatorEx = \p{WB:Regional_Indicator} {ExtFmtZwj}
ComplexContextEx = \p{LB:Complex_Context} {ExtFmtZwj}
%{
/** Alphanumeric sequences */
@ -93,6 +139,9 @@ ComplexContextEx = \p{LB:Complex_Context}
/** Hangul token type */
public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
/** Emoji token type */
public static final int EMOJI_TYPE = StandardTokenizer.EMOJI;
/** Character count processed so far */
public final int yychar()
@ -120,18 +169,64 @@ ComplexContextEx = \p{LB:Complex_Context}
%%
// UAX#29 WB1. sot ÷
// WB2. ÷ eot
// UAX#29 WB1. sot ÷ Any
// WB2. Any ÷ eot
//
<<EOF>> { return YYEOF; }
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
// WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
// Instead of these: UAX#29 WB3c. ZWJ × (Glue_After_Zwj | EBG)
// WB14. (E_Base | EBG) × E_Modifier
// WB15. ^ (RI RI)* RI × RI
// WB16. [^RI] (RI RI)* RI × RI
//
{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
// We use the "emoji_sequence" rule from http://www.unicode.org/reports/tr51/tr51-14.html (Unicode 11.0)
// and the Emoji data from http://unicode.org/Public/emoji/11.0/emoji-data.txt (in included file UnicodeEmojiProperties.jflex)
//
// emoji_sequence :=
// Top-level EBNF Expanded #1 Expanded #2 Expanded #3
// --------------------- ---------------------------- ----------------------------- ----------------------------------------------
// emoji_core_sequence emoji_combining_sequence emoji_character ( \p{Emoji}
// | emoji_presentation_sequence | \p{Emoji} \uFE0F
// | emoji_keycap_sequence | [0-9#*] \u{FE0F 20E3} [1]
// | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier}
// | emoji_flag_sequence | \p{WB:Regional_Indicator}{2} )
//
// | emoji_zwj_sequence emoji_zwj_element emoji_character ( \p{Emoji}
// | emoji_presentation_sequence | \p{Emoji} \uFE0F
// | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier} )
// ( ZWJ emoji_zwj_element )+ ( \p{WB:ZWJ} ^^ )+
//
// | emoji_tag_sequence tag_base emoji_character ( \p{Emoji}
// | emoji_presentation_sequence | \p{Emoji} \uFE0F
// | emoji_modifier_sequence | \p{Emoji_Modifier_Base} \p{Emoji_Modifier} )
// tag_spec [\u{E0020}-\u{E007E}]+
// tag_term \u{E007F}
//
// [1] https://unicode.org/Public/emoji/11.0/emoji-test.txt includes key cap sequences
// WITHOUT \uFE0F (emoji presentation indicator), annotating them as "non-fully-qualified";
// TR#51 says about non-fully-qualified *ZWJ sequences* that implementations may
// choose whether to support them for segmentation. This implementation will
// recognize /[0-9#*]\u20E3/ - i.e. without \uFE0F - as Emoji.
//
// See also: http://www.unicode.org/L2/L2016/16315-handling-seg-emoji.pdf
// https://docs.google.com/document/d/1yDZ5TUZNVVKaM9zYCCLbRIAKGNZANsAGl0bcNzGGvn8
//
// In particular, the above docs recommend a modified UAX#29 WB3c rule (covered by TR#51's "emoji_zwj_sequence"):
//
// WB3c ZWJ × (Extended_Pictographic | EmojiNRK)
//
{EmojiCharOrPresSeqOrModSeq} ( ( \p{WB:ZWJ} {EmojiCharOrPresSeqOrModSeq} )* | {TagSpec}+ {TagTerm} )
| {KeyCapBaseCharEx} {EmojiPresentationSelector}? {KeyCapEx}
| {RegionalIndicatorEx}{2}
{ return EMOJI_TYPE; }
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLetQ) × Numeric
// WB12. Numeric × (MidNum | MidNumLetQ) Numeric
// WB13a. (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (AHLetter | Numeric | Katakana)
//
{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
{ return NUMERIC_TYPE; }
// subset of the below for typing purposes only!
@ -141,28 +236,28 @@ ComplexContextEx = \p{LB:Complex_Context}
{KatakanaEx}+
{ return KATAKANA_TYPE; }
// UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
// WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
// WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
// WB7a. Hebrew_Letter × Single_Quote
// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
// WB9. (ALetter | Hebrew_Letter) × Numeric
// WB10. Numeric × (ALetter | Hebrew_Letter)
// WB13. Katakana × Katakana
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
// UAX#29 WB5. AHLetter × AHLetter
// WB6. AHLetter × (MidLetter | MidNumLetQ) AHLetter
// WB7. AHLetter (MidLetter | MidNumLetQ) × AHLetter
// WB7a. Hebrew_Letter × Single_Quote
// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
// WB9. AHLetter × Numeric
// WB10. Numeric × AHLetter
// WB13. Katakana × Katakana
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
//
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {AHLetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {AHLetterEx} )*
)+
)
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {AHLetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {AHLetterEx} )*
)+
)
)*
@ -172,13 +267,13 @@ ComplexContextEx = \p{LB:Complex_Context}
// From UAX #29:
//
// [C]haracters with the Line_Break property values of Contingent_Break (CB),
// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
// [C]haracters with the Line_Break property values of Contingent_Break (CB),
// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
// boundary property values based on criteria outside of the scope of this
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
// In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
// In Unicode 9.0, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC ( ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
@ -191,17 +286,14 @@ ComplexContextEx = \p{LB:Complex_Context}
//
{ComplexContextEx}+ { return SOUTH_EAST_ASIAN_TYPE; }
// UAX#29 WB14. Any ÷ Any
// UAX#29 WB999. Any ÷ Any
//
{HanEx} { return IDEOGRAPHIC_TYPE; }
{HiraganaEx} { return HIRAGANA_TYPE; }
// UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB13c. Regional_Indicator × Regional_Indicator
// WB14. Any ÷ Any
// UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB999. Any ÷ Any
//
{RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
{ /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
[^] { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, emoji or SE Asian -- ignore it. */ }

View File

@ -18,8 +18,11 @@ package org.apache.lucene.analysis.standard;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
@ -27,6 +30,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockGraphTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TestUtil;
@ -282,7 +286,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
}
public void testUnicodeWordBreaks() throws Exception {
WordBreakTestUnicode_6_3_0 wordBreakTest = new WordBreakTestUnicode_6_3_0();
WordBreakTestUnicode_9_0_0 wordBreakTest = new WordBreakTestUnicode_9_0_0();
wordBreakTest.test(a);
}
@ -358,8 +362,80 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "3_1.,2", new String[] { "3_1", "2" });
}
/** simple emoji */
public void testEmoji() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩 💩💩",
new String[] { "💩", "💩", "💩" },
new String[] { "<EMOJI>", "<EMOJI>", "<EMOJI>" });
}
/** emoji zwj sequence */
public void testEmojiSequence() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "👩‍❤️‍👩",
new String[] { "👩‍❤️‍👩" },
new String[] { "<EMOJI>" });
}
/** emoji zwj sequence with fitzpatrick modifier */
public void testEmojiSequenceWithModifier() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "👨🏼‍⚕️",
new String[] { "👨🏼‍⚕️" },
new String[] { "<EMOJI>" });
}
/** regional indicator */
public void testEmojiRegionalIndicator() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "🇺🇸🇺🇸",
new String[] { "🇺🇸", "🇺🇸" },
new String[] { "<EMOJI>", "<EMOJI>" });
}
/** variation sequence */
public void testEmojiVariationSequence() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣",
new String[] { "#️⃣" },
new String[] { "<EMOJI>" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "3",
new String[] { "3",},
new String[] { "<EMOJI>" });
// text presentation sequences
BaseTokenStreamTestCase.assertAnalyzesTo(a, "#\uFE0E",
new String[] { },
new String[] { });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "3\uFE0E", // \uFE0E is included in \p{WB:Extend}
new String[] { "3\uFE0E",},
new String[] { "<NUM>" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E", // \u2B55 = HEAVY BLACK CIRCLE
new String[] { "\u2B55",},
new String[] { "<EMOJI>" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E\u200D\u2B55\uFE0E",
new String[] { "\u2B55", "\u200D\u2B55"},
new String[] { "<EMOJI>", "<EMOJI>" });
}
public void testEmojiTagSequence() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "🏴󠁧󠁢󠁥󠁮󠁧󠁿",
new String[] { "🏴󠁧󠁢󠁥󠁮󠁧󠁿" },
new String[] { "<EMOJI>" });
}
public void testEmojiTokenization() throws Exception {
// simple emoji around latin
BaseTokenStreamTestCase.assertAnalyzesTo(a, "poo💩poo",
new String[] { "poo", "💩", "poo" },
new String[] { "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>" });
// simple emoji around non-latin
BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩中國💩",
new String[] { "💩", "", "", "💩" },
new String[] { "<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>" });
}
  /**
   * Runs the auto-generated Unicode emoji-test.txt tokenization suite
   * (one assertion per emoji sequence, each expected to produce a single
   * {@code <EMOJI>} token) against this test's analyzer.
   */
  public void testUnicodeEmojiTests() throws Exception {
    EmojiTokenizationTestUnicode_11_0 emojiTest = new EmojiTokenizationTestUnicode_11_0();
    emojiTest.test(a);
  }
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer analyzer = new StandardAnalyzer();
@ -416,4 +492,53 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
assertAnalyzesTo(a, "ab cd toolong xy z", new String[]{"ab", "cd", "toolo", "ng", "xy", "z"});
a.close();
}
  /**
   * Verifies that tokenization is unaffected when a {@link Reader} returns a
   * surrogate pair split across two read() calls: the input's 9th char is a
   * high surrogate, and a reader capped at 9 chars per read forces the pair
   * to straddle a read boundary. Token text must match a normal reader's.
   */
  public void testSplitSurrogatePairWithSpoonFeedReader() throws Exception {
    String text = "12345678\ud800\udf00"; // U+D800 U+DF00 = U+10300 = 𐌀 (OLD ITALIC LETTER A)

    // Collect tokens with normal reader
    StandardAnalyzer a = new StandardAnalyzer();
    TokenStream ts = a.tokenStream("dummy", text);
    List<String> tokens = new ArrayList<>();
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      tokens.add(termAtt.toString());
    }
    ts.end();
    ts.close();

    // Tokens from a spoon-feed reader should be the same as from a normal reader
    // The 9th char is a high surrogate, so the 9-max-chars spoon-feed reader will split the surrogate pair at a read boundary
    Reader reader = new SpoonFeedMaxCharsReaderWrapper(9, new StringReader(text));
    ts = a.tokenStream("dummy", reader);
    termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    // Compare token-by-token against the reference token list collected above
    for (int tokenNum = 0 ; ts.incrementToken() ; ++tokenNum) {
      assertEquals("token #" + tokenNum + " mismatch: ", termAtt.toString(), tokens.get(tokenNum));
    }
    ts.end();
    ts.close();
  }
}
class SpoonFeedMaxCharsReaderWrapper extends Reader {
private final Reader in;
private final int maxChars;
public SpoonFeedMaxCharsReaderWrapper(int maxChars, Reader in) {
this.in = in;
this.maxChars = maxChars;
}
@Override
public void close() throws IOException {
in.close();
}
/** Returns the configured number of chars if available */
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
return in.read(cbuf, off, Math.min(maxChars, len));
}
}

View File

@ -0,0 +1,150 @@
#!/usr/bin/perl
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
use warnings;
use strict;

use File::Spec;
use Getopt::Long;
use LWP::UserAgent;

# Locate this script so the generated Java file can be written next to it.
my ($volume, $directory, $script_name) = File::Spec->splitpath($0);

# Required option: the Unicode emoji data version, e.g. "11.0".
# NOTE(review): the pattern is unanchored, so any string *containing*
# "digits.digits" passes validation -- confirm whether stricter anchoring
# is wanted before relying on this to reject malformed versions.
my $version = '';
unless (GetOptions("version=s" => \$version) && $version =~ /\d+\.\d+/) {
  print STDERR "Usage: $script_name -v <version>\n";
  print STDERR "\tversion must be of the form X.Y, e.g. 11.0\n"
      if ($version);
  exit 1;
}

# Source data URL and names for the generated JUnit test class:
# EmojiTokenizationTestUnicode_<X>_<Y>.java
my $url = "http://www.unicode.org/Public/emoji/${version}/emoji-test.txt";
my $underscore_version = $version;
$underscore_version =~ s/\./_/g;
my $class_name = "EmojiTokenizationTestUnicode_${underscore_version}";
my $output_filename = "${class_name}.java";
my $header =<<"__HEADER__";
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.standard;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.junit.Ignore;
/**
* This class was automatically generated by ${script_name}
* from: ${url}
*
* emoji-test.txt contains emoji char sequences, which are represented as
* tokenization tests in this class.
*
*/
\@Ignore
public class ${class_name} extends BaseTokenStreamTestCase {
public void test(Analyzer analyzer) throws Exception {
for (int i = 0 ; i < tests.length ; i += 2) {
String test = tests[i + 1];
try {
assertAnalyzesTo(analyzer, test, new String[] { test }, new String[] { "<EMOJI>" });
} catch (Throwable t) {
throw new Exception("Failed to tokenize \\"" + tests[i] + "\\":", t);
}
}
}
private String[] tests = new String[] {
__HEADER__
# Fetch the emoji test data and emit the generated Java test class:
# for each data line, write the raw line (as a comment-ish string element)
# followed by the corresponding \u-escaped test string.
my @tests = split /\r?\n/, get_URL_content($url);

my $output_path = File::Spec->catpath($volume, $directory, $output_filename);

# Three-arg open with a lexical filehandle and low-precedence 'or'.
# The original 'open OUT, ">$output_path" || die ...' never fired the die:
# '||' bound to the (always-true) filename string instead of to open().
open my $out, '>', $output_path
    or die "Error opening '$output_path' for writing: $!";
print STDERR "Writing '$output_path'...";
print $out $header;

my $is_first = 1;
for my $line (@tests) {
  next if ($line =~ /^\s*(?:|\#.*)$/); # Skip blank or comment-only lines
  print $out ",\n\n" unless $is_first;
  $is_first = 0;

  # Example line: 1F46E 1F3FB 200D 2642 FE0F ; fully-qualified # 👮🏻‍♂️ man police officer: light skin tone
  $line =~ s/\s+$//;  # Trim trailing whitespace
  $line =~ s/\t/  /g; # Convert tabs to two spaces (no tabs allowed in Lucene source)
  print $out "    \"$line\",\n";

  # The code points are everything before the first ';'; turn each hex
  # group into a Java \uXXXX escape, splitting supplementary-plane chars
  # (5+ hex digits) into their UTF-16 surrogate pairs.
  my ($test_string) = $line =~ /^(.*?)\s*;/;
  $test_string =~ s/([0-9A-F]+)/\\u$1/g;
  $test_string =~ s/\\u([0-9A-F]{5,})/join('', map { "\\u$_" } above_BMP_char_to_surrogates($1))/ge;
  $test_string =~ s/\s//g;
  print $out "    \"${test_string}\"";
}
print $out "  };\n}\n";

# Check close on a write handle: buffered write errors only surface here.
close $out or die "Error closing '$output_path': $!";
print STDERR "done.\n";
# sub above_BMP_char_to_surrogates
#
# Converts hex references to chars above the BMP (i.e., greater than 0xFFFF)
# to the corresponding UTF-16 surrogate pair
#
# Assumption: input string is a sequence of more than four hex digits
#
sub above_BMP_char_to_surrogates {
  my ($hex_codepoint) = @_;

  my $codepoint = hex($hex_codepoint);
  # Standard UTF-16 decomposition: the lead surrogate carries the top ten
  # bits of (codepoint - 0x10000); the trail surrogate the bottom ten bits.
  my $lead  = 0xD800 + (($codepoint - 0x10000) >> 10);
  my $trail = 0xDC00 + ($codepoint & 0x3FF);

  return map { sprintf('%04X', $_) } ($lead, $trail);
}
# sub get_URL_content
#
# Retrieves and returns the content of the given URL.
#
# sub get_URL_content
#
# Retrieves and returns the raw content of the given URL.  On failure,
# prints the HTTP status line to STDERR and exits the script with status 1.
#
# Uses LWP::UserAgent->get() directly rather than hand-building an
# HTTP::Request: the original relied on HTTP::Request being loaded
# transitively by LWP (it was never use'd), an undeclared dependency.
#
sub get_URL_content {
  my $url = shift;
  print STDERR "Retrieving '$url'...";
  my $user_agent = LWP::UserAgent->new;
  my $response = $user_agent->get($url);
  unless ($response->is_success) {
    print STDERR "Failed to download '$url':\n\t",$response->status_line,"\n";
    exit 1;
  }
  print STDERR "done.\n";
  return $response->content;
}

View File

@ -40,8 +40,6 @@ $underscore_version =~ s/\./_/g;
my $class_name = "WordBreakTestUnicode_${underscore_version}";
my $output_filename = "${class_name}.java";
my $header =<<"__HEADER__";
package org.apache.lucene.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -59,6 +57,8 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
package org.apache.lucene.analysis.standard;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.junit.Ignore;
@ -81,7 +81,7 @@ import org.junit.Ignore;
* \\p{WordBreak = Hebrew_Letter}
* \\p{WordBreak = Katakana}
* \\p{WordBreak = Numeric} (Excludes full-width Arabic digits)
* [\\uFF10-\\uFF19] (Full-width Arabic digits)
* [\\uFF10-\\uFF19] (Full-width Arabic digits)
*/
\@Ignore
public class ${class_name} extends BaseTokenStreamTestCase {
@ -91,6 +91,7 @@ __HEADER__
my $codepoints = [];
map { $codepoints->[$_] = 1 } (0xFF10..0xFF19);
my $regional_indicator_codepoints = [];
# Complex_Context is an alias for 'SA', which is used in LineBreak.txt
# Using lowercase versions of property value names to allow for case-
# insensitive comparison with the names in the Unicode data files.
@ -98,7 +99,9 @@ parse_Unicode_data_file($line_break_url, $codepoints, {'sa' => 1});
parse_Unicode_data_file($scripts_url, $codepoints,
{'han' => 1, 'hiragana' => 1});
parse_Unicode_data_file($word_break_url, $codepoints,
{'aletter' => 1, 'hebrew_letter' => 1, 'katakana' => 1, 'numeric' => 1});
{'aletter' => 1, 'hebrew_letter' => 1, 'katakana' => 1, 'numeric' => 1, 'e_base' => 1,
'e_modifier' => 1, 'glue_after_zwj' => 1, 'e_base_gaz' => 1});
parse_Unicode_data_file($word_break_url, $regional_indicator_codepoints, {'regional_indicator' => 1});
my @tests = split /\r?\n/, get_URL_content($word_break_test_url);
my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
@ -124,10 +127,21 @@ for my $line (@tests) {
$test_string =~ s/\\u000D/\\r/g;
$test_string =~ s/\\u0022/\\\"/g;
$sequence =~ s/^\s*÷\s*//; # Trim leading break character
# TODO: When upgrading JFlex to a version that supports Unicode 11.0+: remove the special case below for a Unicode 9.0 test data line that conflicts with TR#51 11.0 test data
# ÷ 200D ÷ 261D ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) ÷ [999.0] WHITE UP POINTING INDEX (E_Base) ÷ [0.3]
if ($sequence =~ /^200D\s*÷\s*261D$/) {
print OUT " // Skipping this test because it conflicts with TR#51 v11.0 rules.\n\n";
next;
}
my @tokens = ();
my $isfirst = 0;
for my $candidate (split /\s*÷\s*/, $sequence) {
$isfirst = 1;
my @chars = ();
my $has_wanted_char = 0;
my $has_wanted_chars = 0;
my $prev_char_regional_indicator = 0;
while ($candidate =~ /([0-9A-F]+)/gi) {
my $hexchar = $1;
if (4 == length($hexchar)) {
@ -135,12 +149,21 @@ for my $line (@tests) {
} else {
push @chars, above_BMP_char_to_surrogates($hexchar);
}
unless ($has_wanted_char) {
$has_wanted_char = 1 if (defined($codepoints->[hex($hexchar)]));
unless ($has_wanted_chars) {
my $codepoint = hex($hexchar);
if (defined($codepoints->[$codepoint])) {
$has_wanted_chars = 1;
} elsif (defined($regional_indicator_codepoints->[$codepoint])) {
if (1 == $prev_char_regional_indicator) {
$has_wanted_chars = 1; # must be 2 regional indicators in a row
} else {
$prev_char_regional_indicator = 1;
}
}
}
}
if ($has_wanted_char) {
push @tokens, '"'.join('', map { "\\u$_" } @chars).'"';
if ($has_wanted_chars) {
push @tokens, '"'.join('', map { $_ eq "0022" ? "\\\"" : "\\u$_" } @chars).'"';
}
}
print OUT " assertAnalyzesTo(analyzer, \"${test_string}\",\n";