LUCENE-2763: Swap URL+Email recognizing StandardTokenizer and UAX29Tokenizer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1043071 13f79535-47bb-0310-9956-ffa450edef68
2010-12-07 14:53:13 +00:00 · 2010-12-07 14:53:13 +00:00 · 2b9726ae81
parent 5b2e0f786b
commit 2b9726ae81
19 changed files with 3560 additions and 3461 deletions
--- a/modules/analysis/CHANGES.txt
+++ b/modules/analysis/CHANGES.txt
@ -9,15 +9,17 @@ API Changes
 * LUCENE-2413: Removed the AnalyzerUtil in common/miscellaneous.  (Robert Muir)
- * LUCENE-2167: StandardTokenizer/Analyzer in common/standard/ now implement
+ * LUCENE-2167,LUCENE-2699,LUCENE-2763: StandardTokenizer/Analyzer in 
-   the Word Break rules from the Unicode Text Segmentation algorithm (UAX#29),
+   common/standard/ now implement the Word Break rules from the Unicode 6.0.0
-   as well as tokenizing URLs and email addresses according to the relevant
+   Text Segmentation algorithm (UAX#29).  
   RFCs.  ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
   behavior.  (Steven Rowe, Robert Muir, Uwe Schindler)
 * LUCENE-2699: Update StandardTokenizer and UAX29Tokenizer to Unicode 6.0.0.
   (Steven Rowe)
   ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
   implementation and behavior.
   UAX29URLEmailTokenizer tokenizes URLs and E-mail addresses according to the
   relevant RFCs, in addition to implementing the UAX#29 Word Break rules.
   (Steven Rowe, Robert Muir, Uwe Schindler)
 * LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles
   can be generated. (Chris Harris via Steven Rowe)
--- a/modules/analysis/common/build.xml
+++ b/modules/analysis/common/build.xml
@ -38,7 +38,7 @@
  <target name="compile-core" depends="jflex-notice, common.compile-core"/>
-  <target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-UAX29Tokenizer,jflex-wiki-tokenizer"/>
+  <target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,jflex-wiki-tokenizer"/>
  <target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
    <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
@ -62,11 +62,11 @@
           nobak="on" />
  </target>
-  <target name="jflex-UAX29Tokenizer" depends="jflex-check" if="jflex.present">
+  <target name="jflex-UAX29URLEmailTokenizer" depends="jflex-check" if="jflex.present">
    <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
 			<classpath refid="jflex.classpath"/>
    </taskdef>
-    <jflex file="src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex"
+    <jflex file="src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex"
           outdir="src/java/org/apache/lucene/analysis/standard"
           nobak="on" />
  </target>
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
@ -15,8 +15,8 @@
 */
 // Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
-// file version from Tuesday, October 12, 2010 11:34:09 AM UTC
+// file version from Saturday, December 4, 2010 12:34:19 PM UTC
-// generated on Wednesday, October 13, 2010 4:12:27 AM UTC
+// generated on Sunday, December 5, 2010 12:24:12 AM UTC
 // by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
 ASCIITLD = "." (
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/3/10 9:07 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 12/4/10 7:24 PM */
 package org.apache.lucene.analysis.standard;
@ -33,8 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 /**
 * This class is a scanner generated by 
 * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 10/3/10 9:07 AM from the specification file
+ * on 12/4/10 7:24 PM from the specification file
- * <tt>C:/Users/rmuir/workspace/lucene-clean/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
+ * <tt>C:/cygwin/home/us/svn/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
 */
 class ClassicTokenizerImpl implements StandardTokenizerInterface {
@ -630,6 +630,12 @@ public final void getText(CharTermAttribute t) {
      zzState = ZZ_LEXSTATE[zzLexicalState];
      // set up zzAction for empty match case:
      int zzAttributes = zzAttrL[zzState];
      if ( (zzAttributes & 1) == 1 ) {
        zzAction = zzState;
      }
      zzForAction: {
        while (true) {
@ -662,7 +668,7 @@ public final void getText(CharTermAttribute t) {
          if (zzNext == -1) break zzForAction;
          zzState = zzNext;
-          int zzAttributes = zzAttrL[zzState];
+          zzAttributes = zzAttrL[zzState];
          if ( (zzAttributes & 1) == 1 ) {
            zzAction = zzState;
            zzMarkedPosL = zzCurrentPosL;
@ -676,45 +682,45 @@ public final void getText(CharTermAttribute t) {
      zzMarkedPos = zzMarkedPosL;
      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
        case 10: 
          { return EMAIL;
          }
        case 11: break;
        case 2: 
          { return ALPHANUM;
          }
        case 12: break;
        case 4: 
          { return HOST;
          }
        case 13: break;
        case 1: 
          { /* ignore */
          }
        case 14: break;
        case 8: 
          { return ACRONYM_DEP;
          }
        case 15: break;
        case 5: 
          { return NUM;
          }
-        case 16: break;
+        case 11: break;
        case 9: 
          { return ACRONYM;
          }
-        case 17: break;
+        case 12: break;
        case 7: 
          { return COMPANY;
          }
-        case 18: break;
+        case 13: break;
        case 10: 
          { return EMAIL;
          }
        case 14: break;
        case 1: 
          { /* ignore */
          }
        case 15: break;
        case 6: 
          { return APOSTROPHE;
          }
-        case 19: break;
+        case 16: break;
        case 3: 
          { return CJ;
          }
        case 17: break;
        case 8: 
          { return ACRONYM_DEP;
          }
        case 18: break;
        case 2: 
          { return ALPHANUM;
          }
        case 19: break;
        case 4: 
          { return HOST;
          }
        case 20: break;
        default: 
          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt
@ -16,6 +16,6 @@
 */
-WARNING: if you change StandardTokenizerImpl*.jflex and need to regenerate
+WARNING: if you change StandardTokenizerImpl*.jflex or UAX29URLEmailTokenizer
-      the tokenizer, only use the trunk version of JFlex 1.5 (with a minimum
+      and need to regenerate the tokenizer, only use the trunk version
-      SVN revision 597) at the moment!
+      of JFlex 1.5 (with a minimum SVN revision 597) at the moment!
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
@ -83,10 +83,9 @@ public final class StandardTokenizer extends Tokenizer {
  @Deprecated
  public static final int ACRONYM_DEP       = 8;
-  public static final int URL = 9;
+  public static final int SOUTHEAST_ASIAN = 9;
-  public static final int SOUTHEAST_ASIAN = 10;
+  public static final int IDEOGRAPHIC = 10;
-  public static final int IDEOGRAPHIC = 11;
+  public static final int HIRAGANA = 11;
  public static final int HIRAGANA = 12;
  /** String token types that correspond to token type int constants */
  public static final String [] TOKEN_TYPES = new String [] {
@ -99,7 +98,6 @@ public final class StandardTokenizer extends Tokenizer {
    "<NUM>",
    "<CJ>",
    "<ACRONYM_DEP>",
    "<URL>",
    "<SOUTHEAST_ASIAN>",
    "<IDEOGRAPHIC>",
    "<HIRAGANA>"
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
@ -23,14 +23,11 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 * This class implements Word Break rules from the Unicode Text Segmentation 
 * algorithm, as specified in 
 * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a> 
 * URLs and email addresses are also tokenized according to the relevant RFCs.
 * <p/>
 * Tokens produced are of the following types:
 * <ul>
 *   <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
 *   <li>&lt;NUM&gt;: A number</li>
 *   <li>&lt;URL&gt;: A URL</li>
 *   <li>&lt;EMAIL&gt;: An email address</li>
 *   <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
 *       Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
 *   <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
@ -67,83 +64,6 @@ MidNumericEx   = [\p{WB:MidNum}\p{WB:MidNumLet}]    [\p{WB:Format}\p{WB:Extend}]
 ExtendNumLetEx = \p{WB:ExtendNumLet}                [\p{WB:Format}\p{WB:Extend}]*
 // URL and E-mail syntax specifications:
 //
 //     RFC-952:  DOD INTERNET HOST TABLE SPECIFICATION
 //     RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
 //     RFC-1123: Requirements for Internet Hosts - Application and Support
 //     RFC-1738: Uniform Resource Locators (URL)
 //     RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
 //     RFC-5234: Augmented BNF for Syntax Specifications: ABNF
 //     RFC-5321: Simple Mail Transfer Protocol
 //     RFC-5322: Internet Message Format
 %include src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
 DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
 DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
 DomainNameLoose  = {DomainLabel} ("." {DomainLabel})*
 IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
 IPv4Address  = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3} 
 IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
 IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
 IPv6Address =                                                  ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
            |                                             "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
            |                            {IPv6Hex16Bit}?  "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::"  {IPv6Hex16Bit} ":"     {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::"                         {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::"                         {IPv6Hex16Bit}
            | (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"
 URIunreserved = [-._~A-Za-z0-9]
 URIpercentEncoded = "%" [0-9A-Fa-f]{2}
 URIsubDelims = [!$&'()*+,;=]
 URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
 URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
 URIquery    = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
 URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
 URIport = ":" [0-9]{1,5}
 URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}  
 URIhostLoose  = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose} 
 URIauthorityStrict =             {URIhostStrict} {URIport}?
 URIauthorityLoose  = {URIlogin}? {URIhostLoose}  {URIport}?
 HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
 HTTPpath = ("/" {HTTPsegment})*
 HTTPscheme = [hH][tT][tT][pP][sS]? "://"
 HTTPurlFull = {HTTPscheme} {URIauthorityLoose}  {HTTPpath}? {URIquery}? {URIfragment}?
 // {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
 HTTPurlNoScheme =          {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
 HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}
 FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
 FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
 FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
 FTPscheme = [fF][tT][pP] "://"
 FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?
 FILEscheme = [fF][iI][lL][eE] "://"
 FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?
 URL = {HTTPurl} | {FTPurl} | {FILEurl}
 EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
 EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
 EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
 EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
 EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
 // DFA minimization allows {IPv6Address} and {IPv4Address} to be included 
 // in the {EMAILbracketedHost} definition without incurring any size penalties, 
 // since {EMAILdomainLiteralText} recognizes all valid IP addresses.
 // The IP address regexes are included in {EMAILbracketedHost} simply as a 
 // reminder that they are acceptable bracketed host forms.
 EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"
 EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
 %{
  /** Alphanumeric sequences */
  public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;
@ -151,12 +71,6 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
  /** Numbers */
  public static final int NUMERIC_TYPE = StandardTokenizer.NUM;
  /** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
  public static final int URL_TYPE = StandardTokenizer.URL;
  /** E-mail addresses */
  public static final int EMAIL_TYPE = StandardTokenizer.EMAIL;
  /**
   * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
   * scripts (Thai, Lao, Myanmar, Khmer, etc.).  Sequences of these are kept 
@ -191,9 +105,6 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
 //
 <<EOF>> { return StandardTokenizerInterface.YYEOF; }
 {URL}   { return URL_TYPE; }
 {EMAIL} { return EMAIL_TYPE; }
 // UAX#29 WB8.   Numeric × Numeric
 //        WB11.  Numeric (MidNum | MidNumLet) × Numeric
 //        WB12.  Numeric × (MidNum | MidNumLet) Numeric
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.java
@ -1,847 +0,0 @@
 /* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/13/10 12:12 AM */
 package org.apache.lucene.analysis.standard;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.IOException;
 import java.io.Reader;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.AttributeSource;
 /**
 * This class implements Word Break rules from the Unicode Text Segmentation 
 * algorithm, as specified in 
 * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a> 
 * <p/>
 * Tokens produced are of the following types:
 * <ul>
 *   <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
 *   <li>&lt;NUM&gt;: A number</li>
 *   <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
 *       Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
 *   <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
 *   <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
 * </ul>
 * <b>WARNING</b>: Because JFlex does not support Unicode supplementary 
 * characters (characters above the Basic Multilingual Plane, which contains
 * those up to and including U+FFFF), this scanner will not recognize them
 * properly.  If you need to be able to process text containing supplementary 
 * characters, consider using the ICU4J-backed implementation in modules/analysis/icu  
 * (org.apache.lucene.analysis.icu.segmentation.ICUTokenizer)
 * instead of this class, since the ICU4J-backed implementation does not have
 * this limitation.
 */
 public final class UAX29Tokenizer extends Tokenizer {
  /** This character denotes the end of file */
  private static final int YYEOF = -1;
  /** initial size of the lookahead buffer */
  private static final int ZZ_BUFFERSIZE = 16384;
  /** lexical states */
  private static final int YYINITIAL = 0;
  /**
   * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
   * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
   *                  at the beginning of a line
   * l is of the form l = 2*k, k a non negative integer
   */
  private static final int ZZ_LEXSTATE[] = { 
     0, 0
  };
  /** 
   * Translates characters to character classes
   */
  private static final String ZZ_CMAP_PACKED = 
    "\47\0\1\7\4\0\1\6\1\0\1\7\1\0\12\3\1\5\1\6"+
    "\5\0\32\1\4\0\1\10\1\0\32\1\57\0\1\1\2\0\1\2"+
    "\7\0\1\1\1\0\1\5\2\0\1\1\5\0\27\1\1\0\37\1"+
    "\1\0\u01ca\1\4\0\14\1\16\0\5\1\7\0\1\1\1\0\1\1"+
    "\21\0\160\2\5\1\1\0\2\1\2\0\4\1\1\6\7\0\1\1"+
    "\1\5\3\1\1\0\1\1\1\0\24\1\1\0\123\1\1\0\213\1"+
    "\1\0\7\2\236\1\11\0\46\1\2\0\1\1\7\0\47\1\1\0"+
    "\1\6\7\0\55\2\1\0\1\2\1\0\2\2\1\0\2\2\1\0"+
    "\1\2\10\0\33\1\5\0\4\1\1\5\13\0\4\2\10\0\2\6"+
    "\2\0\13\2\5\0\53\1\25\2\12\3\1\0\1\3\1\6\1\0"+
    "\2\1\1\2\143\1\1\0\1\1\10\2\1\0\6\2\2\1\2\2"+
    "\1\0\4\2\2\1\12\3\3\1\2\0\1\1\17\0\1\2\1\1"+
    "\1\2\36\1\33\2\2\0\131\1\13\2\1\1\16\0\12\3\41\1"+
    "\11\2\2\1\2\0\1\6\1\0\1\1\5\0\26\1\4\2\1\1"+
    "\11\2\1\1\3\2\1\1\5\2\22\0\31\1\3\2\244\0\4\2"+
    "\66\1\3\2\1\1\22\2\1\1\7\2\12\1\2\2\2\0\12\3"+
    "\1\0\7\1\1\0\7\1\1\0\3\2\1\0\10\1\2\0\2\1"+
    "\2\0\26\1\1\0\7\1\1\0\1\1\3\0\4\1\2\0\1\2"+
    "\1\1\7\2\2\0\2\2\2\0\3\2\1\1\10\0\1\2\4\0"+
    "\2\1\1\0\3\1\2\2\2\0\12\3\2\1\17\0\3\2\1\0"+
    "\6\1\4\0\2\1\2\0\26\1\1\0\7\1\1\0\2\1\1\0"+
    "\2\1\1\0\2\1\2\0\1\2\1\0\5\2\4\0\2\2\2\0"+
    "\3\2\3\0\1\2\7\0\4\1\1\0\1\1\7\0\12\3\2\2"+
    "\3\1\1\2\13\0\3\2\1\0\11\1\1\0\3\1\1\0\26\1"+
    "\1\0\7\1\1\0\2\1\1\0\5\1\2\0\1\2\1\1\10\2"+
    "\1\0\3\2\1\0\3\2\2\0\1\1\17\0\2\1\2\2\2\0"+
    "\12\3\21\0\3\2\1\0\10\1\2\0\2\1\2\0\26\1\1\0"+
    "\7\1\1\0\2\1\1\0\5\1\2\0\1\2\1\1\7\2\2\0"+
    "\2\2\2\0\3\2\10\0\2\2\4\0\2\1\1\0\3\1\2\2"+
    "\2\0\12\3\1\0\1\1\20\0\1\2\1\1\1\0\6\1\3\0"+
    "\3\1\1\0\4\1\3\0\2\1\1\0\1\1\1\0\2\1\3\0"+
    "\2\1\3\0\3\1\3\0\14\1\4\0\5\2\3\0\3\2\1\0"+
    "\4\2\2\0\1\1\6\0\1\2\16\0\12\3\21\0\3\2\1\0"+
    "\10\1\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1\3\0"+
    "\1\1\7\2\1\0\3\2\1\0\4\2\7\0\2\2\1\0\2\1"+
    "\6\0\2\1\2\2\2\0\12\3\22\0\2\2\1\0\10\1\1\0"+
    "\3\1\1\0\27\1\1\0\12\1\1\0\5\1\2\0\1\2\1\1"+
    "\7\2\1\0\3\2\1\0\4\2\7\0\2\2\7\0\1\1\1\0"+
    "\2\1\2\2\2\0\12\3\1\0\2\1\17\0\2\2\1\0\10\1"+
    "\1\0\3\1\1\0\51\1\2\0\1\1\7\2\1\0\3\2\1\0"+
    "\4\2\1\1\10\0\1\2\10\0\2\1\2\2\2\0\12\3\12\0"+
    "\6\1\2\0\2\2\1\0\22\1\3\0\30\1\1\0\11\1\1\0"+
    "\1\1\2\0\7\1\3\0\1\2\4\0\6\2\1\0\1\2\1\0"+
    "\10\2\22\0\2\2\15\0\60\11\1\12\2\11\7\12\5\0\7\11"+
    "\10\12\1\0\12\3\47\0\2\11\1\0\1\11\2\0\2\11\1\0"+
    "\1\11\2\0\1\11\6\0\4\11\1\0\7\11\1\0\3\11\1\0"+
    "\1\11\1\0\1\11\2\0\2\11\1\0\4\11\1\12\2\11\6\12"+
    "\1\0\2\12\1\11\2\0\5\11\1\0\1\11\1\0\6\12\2\0"+
    "\12\3\2\0\2\11\42\0\1\1\27\0\2\2\6\0\12\3\13\0"+
    "\1\2\1\0\1\2\1\0\1\2\4\0\2\2\10\1\1\0\44\1"+
    "\4\0\24\2\1\0\2\2\5\1\13\2\1\0\44\2\11\0\1\2"+
    "\71\0\53\11\24\12\1\11\12\3\6\0\6\11\4\12\4\11\3\12"+
    "\1\11\3\12\2\11\7\12\3\11\4\12\15\11\14\12\1\11\1\12"+
    "\12\3\4\12\2\11\46\1\12\0\53\1\1\0\1\1\3\0\u0149\1"+
    "\1\0\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0\51\1"+
    "\1\0\4\1\2\0\41\1\1\0\4\1\2\0\7\1\1\0\1\1"+
    "\1\0\4\1\2\0\17\1\1\0\71\1\1\0\4\1\2\0\103\1"+
    "\2\0\3\2\40\0\20\1\20\0\125\1\14\0\u026c\1\2\0\21\1"+
    "\1\0\32\1\5\0\113\1\3\0\3\1\17\0\15\1\1\0\4\1"+
    "\3\2\13\0\22\1\3\2\13\0\22\1\2\2\14\0\15\1\1\0"+
    "\3\1\1\0\2\2\14\0\64\11\40\12\3\0\1\11\4\0\1\11"+
    "\1\12\2\0\12\3\41\0\3\2\2\0\12\3\6\0\130\1\10\0"+
    "\51\1\1\2\1\1\5\0\106\1\12\0\35\1\3\0\14\2\4\0"+
    "\14\2\12\0\12\3\36\11\2\0\5\11\13\0\54\11\4\0\21\12"+
    "\7\11\2\12\6\0\12\3\1\11\3\0\2\11\40\0\27\1\5\2"+
    "\4\0\65\11\12\12\1\0\35\12\2\0\1\2\12\3\6\0\12\3"+
    "\6\0\16\11\122\0\5\2\57\1\21\2\7\1\4\0\12\3\21\0"+
    "\11\2\14\0\3\2\36\1\12\2\3\0\2\1\12\3\6\0\46\1"+
    "\16\2\14\0\44\1\24\2\10\0\12\3\3\0\3\1\12\3\44\1"+
    "\122\0\3\2\1\0\25\2\4\1\1\2\4\1\1\2\15\0\300\1"+
    "\47\2\25\0\4\2\u0116\1\2\0\6\1\2\0\46\1\2\0\6\1"+
    "\2\0\10\1\1\0\1\1\1\0\1\1\1\0\1\1\1\0\37\1"+
    "\2\0\65\1\1\0\7\1\1\0\1\1\3\0\3\1\1\0\7\1"+
    "\3\0\4\1\2\0\6\1\4\0\15\1\5\0\3\1\1\0\7\1"+
    "\17\0\4\2\10\0\2\7\12\0\1\7\2\0\1\5\2\0\5\2"+
    "\20\0\2\10\3\0\1\6\17\0\1\10\13\0\5\2\5\0\6\2"+
    "\1\0\1\1\15\0\1\1\20\0\15\1\63\0\41\2\21\0\1\1"+
    "\4\0\1\1\2\0\12\1\1\0\1\1\3\0\5\1\6\0\1\1"+
    "\1\0\1\1\1\0\1\1\1\0\4\1\1\0\13\1\2\0\4\1"+
    "\5\0\5\1\4\0\1\1\21\0\51\1\u032d\0\64\1\u0716\0\57\1"+
    "\1\0\57\1\1\0\205\1\6\0\4\1\3\2\16\0\46\1\12\0"+
    "\66\1\11\0\1\1\17\0\1\2\27\1\11\0\7\1\1\0\7\1"+
    "\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1"+
    "\1\0\7\1\1\0\40\2\57\0\1\1\120\0\32\13\1\0\131\13"+
    "\14\0\326\13\57\0\1\1\1\0\1\13\31\0\11\13\6\2\1\0"+
    "\5\4\2\0\3\13\1\1\1\1\4\0\126\14\2\0\2\2\2\4"+
    "\3\14\133\4\1\0\4\4\5\0\51\1\3\0\136\1\21\0\33\1"+
    "\65\0\20\4\320\0\57\4\1\0\130\4\250\0\u19b6\13\112\0\u51cc\13"+
    "\64\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\3\2\1"+
    "\24\0\57\1\4\2\11\0\2\2\1\0\31\1\10\0\120\1\2\2"+
    "\45\0\11\1\2\0\147\1\2\0\4\1\1\0\2\1\16\0\12\1"+
    "\120\0\10\1\1\2\3\1\1\2\4\1\1\2\27\1\5\2\30\0"+
    "\64\1\14\0\2\2\62\1\21\2\13\0\12\3\6\0\22\2\6\1"+
    "\3\0\1\1\4\0\12\3\34\1\10\2\2\0\27\1\15\2\14\0"+
    "\35\1\3\0\4\2\57\1\16\2\16\0\1\1\12\3\46\0\51\1"+
    "\16\2\11\0\3\1\1\2\10\1\2\2\2\0\12\3\6\0\33\11"+
    "\1\12\4\0\60\11\1\12\1\11\3\12\2\11\2\12\5\11\2\12"+
    "\1\11\1\12\1\11\30\0\5\11\41\0\6\1\2\0\6\1\2\0"+
    "\6\1\11\0\7\1\1\0\7\1\221\0\43\1\10\2\1\0\2\2"+
    "\2\0\12\3\6\0\u2ba4\1\14\0\27\1\4\0\61\1\u2104\0\u012e\13"+
    "\2\0\76\13\2\0\152\13\46\0\7\1\14\0\5\1\5\0\1\1"+
    "\1\2\12\1\1\0\15\1\1\0\5\1\1\0\1\1\1\0\2\1"+
    "\1\0\2\1\1\0\154\1\41\0\u016b\1\22\0\100\1\2\0\66\1"+
    "\50\0\14\1\4\0\20\2\1\6\2\0\1\5\1\6\13\0\7\2"+
    "\14\0\2\10\30\0\3\10\1\6\1\0\1\7\1\0\1\6\1\5"+
    "\32\0\5\1\1\0\207\1\2\0\1\2\7\0\1\7\4\0\1\6"+
    "\1\0\1\7\1\0\12\3\1\5\1\6\5\0\32\1\4\0\1\10"+
    "\1\0\32\1\13\0\70\4\2\2\37\1\3\0\6\1\2\0\6\1"+
    "\2\0\6\1\2\0\3\1\34\0\3\2\4\0";
  /** 
   * Translates characters to character classes
   */
  private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
  /** 
   * Translates DFA states to action switch labels.
   */
  private static final int [] ZZ_ACTION = zzUnpackAction();
  private static final String ZZ_ACTION_PACKED_0 =
    "\1\0\1\1\1\2\1\3\1\2\1\1\1\4\1\5"+
    "\1\6\1\2\1\0\1\2\1\0\1\3\2\0";
  /**
   * Decodes {@code ZZ_ACTION_PACKED_0} into a newly allocated 16-element table.
   *
   * @return the unpacked table
   */
  private static int [] zzUnpackAction() {
    final int [] table = new int[16];
    // The end index returned by the decoder is not needed here.
    zzUnpackAction(ZZ_ACTION_PACKED_0, 0, table);
    return table;
  }
  /**
   * Expands a run-length encoded string into {@code result}, starting at
   * index {@code offset}.  The packed string is read as consecutive
   * (count, value) character pairs; each value is written {@code count}
   * times, and once even when the count is zero.
   *
   * @param packed the (count, value) character pairs to decode
   * @param offset index in {@code result} of the first element to write
   * @param result destination array
   * @return the index just past the last element written
   */
  private static int zzUnpackAction(String packed, int offset, int [] result) {
    final int packedLength = packed.length();
    int out = offset;
    for (int in = 0; in < packedLength; ) {
      final int runLength = packed.charAt(in++);
      final int value = packed.charAt(in++);
      // The first copy is unconditional, so a zero count still yields one element.
      result[out++] = value;
      for (int copy = 1; copy < runLength; copy++) {
        result[out++] = value;
      }
    }
    return out;
  }
  /** 
   * Translates a state to a row index in the transition table
   */
  private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
  private static final String ZZ_ROWMAP_PACKED_0 =
    "\0\0\0\15\0\32\0\47\0\64\0\101\0\116\0\15"+
    "\0\15\0\133\0\150\0\165\0\202\0\217\0\101\0\234";
  /**
   * Decodes {@code ZZ_ROWMAP_PACKED_0} into a newly allocated 16-element table.
   *
   * @return the unpacked table
   */
  private static int [] zzUnpackRowMap() {
    final int [] table = new int[16];
    // The end index returned by the decoder is not needed here.
    zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, 0, table);
    return table;
  }
  /**
   * Decodes a packed string into {@code result}, starting at index
   * {@code offset}.  Every two consecutive characters form one 32-bit
   * entry: the first character supplies the high 16 bits and the second
   * the low 16 bits.
   *
   * @param packed the character pairs to decode
   * @param offset index in {@code result} of the first element to write
   * @param result destination array
   * @return the index just past the last element written
   */
  private static int zzUnpackRowMap(String packed, int offset, int [] result) {
    final int packedLength = packed.length();
    int out = offset;
    for (int in = 0; in < packedLength; in += 2) {
      final int upperHalf = packed.charAt(in) << 16;
      // Read the low half before storing, so a truncated pair writes nothing.
      final int lowerHalf = packed.charAt(in + 1);
      result[out++] = upperHalf | lowerHalf;
    }
    return out;
  }
  /** 
   * The transition table of the DFA
   */
  private static final int [] ZZ_TRANS = zzUnpackTrans();
  private static final String ZZ_TRANS_PACKED_0 =
    "\1\2\1\3\1\2\1\4\1\5\3\2\1\6\2\7"+
    "\1\10\1\11\16\0\2\3\1\12\1\0\1\13\1\0"+
    "\1\13\1\14\1\0\1\3\3\0\1\3\2\4\2\0"+
    "\2\15\1\16\1\0\1\4\4\0\1\5\1\0\1\5"+
    "\3\0\1\14\1\0\1\5\3\0\1\3\1\17\1\4"+
    "\1\5\3\0\1\17\1\0\1\17\13\0\2\7\3\0"+
    "\1\3\2\12\2\0\2\20\1\14\1\0\1\12\3\0"+
    "\1\3\1\13\7\0\1\13\3\0\1\3\1\14\1\12"+
    "\1\5\3\0\1\14\1\0\1\14\4\0\1\15\1\4"+
    "\6\0\1\15\3\0\1\3\1\16\1\4\1\5\3\0"+
    "\1\16\1\0\1\16\4\0\1\20\1\12\6\0\1\20"+
    "\2\0";
  /**
   * Decodes {@code ZZ_TRANS_PACKED_0} into a newly allocated 169-element table.
   *
   * @return the unpacked table
   */
  private static int [] zzUnpackTrans() {
    final int [] table = new int[169];
    // The end index returned by the decoder is not needed here.
    zzUnpackTrans(ZZ_TRANS_PACKED_0, 0, table);
    return table;
  }
  /**
   * Expands a run-length encoded string into {@code result}, starting at
   * index {@code offset}.  The packed string is read as consecutive
   * (count, value) character pairs; the stored element is the value minus
   * one, written {@code count} times and once even when the count is zero.
   *
   * @param packed the (count, value) character pairs to decode
   * @param offset index in {@code result} of the first element to write
   * @param result destination array
   * @return the index just past the last element written
   */
  private static int zzUnpackTrans(String packed, int offset, int [] result) {
    final int packedLength = packed.length();
    int out = offset;
    for (int in = 0; in < packedLength; ) {
      final int runLength = packed.charAt(in++);
      // Values are stored shifted up by one, so a packed 0 decodes to -1.
      final int value = packed.charAt(in++) - 1;
      // The first copy is unconditional, so a zero count still yields one element.
      result[out++] = value;
      for (int copy = 1; copy < runLength; copy++) {
        result[out++] = value;
      }
    }
    return out;
  }
  /* error codes */
  private static final int ZZ_UNKNOWN_ERROR = 0;
  private static final int ZZ_NO_MATCH = 1;
  private static final int ZZ_PUSHBACK_2BIG = 2;
  /*
   * Error messages for the codes above, listed in the same order as the
   * error-code constants (0, 1, 2).
   */
  private static final String ZZ_ERROR_MSG[] = {
    "Unknown internal scanner error",
    "Error: could not match input",
    "Error: pushback value was too large"
  };
  /**
   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
   */
  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
  private static final String ZZ_ATTRIBUTE_PACKED_0 =
    "\1\0\1\11\5\1\2\11\1\1\1\0\1\1\1\0"+
    "\1\1\2\0";
  /**
   * Decodes {@code ZZ_ATTRIBUTE_PACKED_0} into a newly allocated 16-element table.
   *
   * @return the unpacked table
   */
  private static int [] zzUnpackAttribute() {
    final int [] table = new int[16];
    // The end index returned by the decoder is not needed here.
    zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, 0, table);
    return table;
  }
  /**
   * Expands a run-length encoded string into {@code result}, starting at
   * index {@code offset}.  The packed string is read as consecutive
   * (count, value) character pairs; each value is written {@code count}
   * times, and once even when the count is zero.
   *
   * @param packed the (count, value) character pairs to decode
   * @param offset index in {@code result} of the first element to write
   * @param result destination array
   * @return the index just past the last element written
   */
  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
    final int packedLength = packed.length();
    int out = offset;
    for (int in = 0; in < packedLength; ) {
      final int runLength = packed.charAt(in++);
      final int value = packed.charAt(in++);
      // The first copy is unconditional, so a zero count still yields one element.
      result[out++] = value;
      for (int copy = 1; copy < runLength; copy++) {
        result[out++] = value;
      }
    }
    return out;
  }
  /** the input device */
  private java.io.Reader zzReader;
  /** the current state of the DFA */
  private int zzState;
  /** the current lexical state */
  private int zzLexicalState = YYINITIAL;
  /** this buffer contains the current text to be matched and is
      the source of the yytext() string */
  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
  /** the textposition at the last accepting state */
  private int zzMarkedPos;
  /** the current text position in the buffer */
  private int zzCurrentPos;
  /** startRead marks the beginning of the yytext() string in the buffer */
  private int zzStartRead;
  /** endRead marks the last character in the buffer, that has been read
      from input */
  private int zzEndRead;
  /** number of newlines encountered up to the start of the matched text */
  private int yyline;
  /** the number of characters up to the start of the matched text */
  private int yychar;
  /**
   * the number of characters from the last newline up to the start of the 
   * matched text
   */
  private int yycolumn;
  /** 
   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
   */
  private boolean zzAtBOL = true;
  /** zzAtEOF == true <=> the scanner is at the EOF */
  private boolean zzAtEOF;
  /** denotes if the user-EOF-code has already been executed */
  private boolean zzEOFDone;
  /* user code: */
  /** Alphanumeric sequences */
  public static final String WORD_TYPE = "<ALPHANUM>";
  /** Numbers */
  public static final String NUMERIC_TYPE = "<NUM>";
  /**
   * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
   * scripts (Thai, Lao, Myanmar, Khmer, etc.).  Sequences of these are kept 
   * together as a single token rather than broken up, because the logic
   * required to break them at word boundaries is too complex for UAX#29.
   * <p>
   * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
   */
  public static final String SOUTH_EAST_ASIAN_TYPE = "<SOUTHEAST_ASIAN>";
  public static final String IDEOGRAPHIC_TYPE = "<IDEOGRAPHIC>";
  public static final String HIRAGANA_TYPE = "<HIRAGANA>";
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncrAtt 
    = addAttribute(PositionIncrementAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
  private int posIncr;
  /**
   * Creates a tokenizer by passing {@code source} and {@code input} to the
   * superclass constructor and using {@code input} as the scanner's reader.
   *
   * @param source The AttributeSource to use
   * @param input The input reader
   */
  public UAX29Tokenizer(AttributeSource source, Reader input) {
    super(source, input);
    this.zzReader = input;
  }
  /**
   * Creates a tokenizer by passing {@code factory} and {@code input} to the
   * superclass constructor and using {@code input} as the scanner's reader.
   *
   * @param factory The AttributeFactory to use
   * @param input The input reader
   */
  public UAX29Tokenizer(AttributeFactory factory, Reader input) {
    super(factory, input);
    this.zzReader = input;
  }
  /**
   * Sets the maximum token length.  Tokens longer than this value are
   * skipped rather than emitted.
   *
   * @param length the new max allowed token length
   */
  public void setMaxTokenLength(int length) {
    maxTokenLength = length;
  }
  /**
   * Reports the current maximum token length.  Matches longer than this are
   * skipped rather than emitted.
   *
   * @return the max allowed token length
   */
  public int getMaxTokenLength() {
    final int currentLimit = this.maxTokenLength;
    return currentLimit;
  }
  /**
   * Sets both the start and end of the OffsetAttribute to the corrected
   * offset just past the scanner's last match.
   */
  @Override
  public final void end() {
    final int rawEnd = yychar + yylength();
    final int correctedEnd = correctOffset(rawEnd);
    offsetAtt.setOffset(correctedEnd, correctedEnd);
  }
  /**
   * Switches this tokenizer to a new input: first lets the superclass reset
   * itself on the reader, then re-initializes the generated scanner state.
   *
   * @param reader the new input reader
   * @throws IOException if the superclass reset fails
   */
  @Override
  public void reset(Reader reader) throws IOException {
    super.reset(reader);
    // Rewind all scanner positions and make the scanner read from the new reader.
    this.yyreset(reader);
  }
  /**
   * Advances to the next token.  Clears all attributes, resets the pending
   * position increment to one, and then delegates to the generated scanning
   * method.
   *
   * @return what getNextToken() returns: true when a token was populated,
   *         false at end of input
   * @throws IOException if reading the input fails
   */
  @Override
  public final boolean incrementToken() throws IOException {
    // This wrapper exists because JFlex offers no way to
    //   1. run code at the start of the generated scanning method, or
    //   2. put @Override on the generated scanning method.
    clearAttributes();
    posIncr = 1;
    final boolean tokenFound = getNextToken();
    return tokenFound;
  }
  /**
   * Copies the current match into this TokenStream's attributes: the term
   * text, the corrected start/end offsets, the given token type, and the
   * pending position increment.  The increment is one, plus one for every
   * immediately preceding match that was skipped for being too long.
   * <p/>
   * When the current match is longer than maxTokenLength no attribute is
   * written; the pending position increment is bumped and false is returned.
   *
   * @param tokenType The type of the matching token
   * @return true if a token is available (not too long); false otherwise
   */
  private boolean populateAttributes(String tokenType) {
    final int matchLength = yylength();
    if (matchLength > maxTokenLength) {
      // An over-long match is treated like a stopword: it is dropped and
      // leaves a position gap in front of the next emitted token.
      ++posIncr;
      return false;
    }
    termAtt.copyBuffer(zzBuffer, zzStartRead, matchLength);
    posIncrAtt.setPositionIncrement(posIncr);
    final int startOffset = correctOffset(yychar);
    final int endOffset = correctOffset(yychar + matchLength);
    offsetAtt.setOffset(startOffset, endOffset);
    typeAtt.setType(tokenType);
    return true;
  }
  /**
   * Creates a new scanner that tokenizes the characters of the given reader.
   * There is also a java.io.InputStream version of this constructor.
   *
   * @param   in  the java.io.Reader to read input from.
   */
  public UAX29Tokenizer(java.io.Reader in) {
    super(in);
    // Same reader instance as the one passed to the superclass.
    zzReader = in;
  }
  /**
   * Creates a new scanner that reads from a byte stream.
   * There is also a java.io.Reader version of this constructor.
   * <p>
   * The stream is wrapped in a java.io.InputStreamReader created without an
   * explicit charset, so bytes are decoded with the platform default charset.
   *
   * @param   in  the java.io.InputStream to read input from.
   */
  public UAX29Tokenizer(java.io.InputStream in) {
    this(new java.io.InputStreamReader(in));
  }
  /**
   * Unpacks the compressed character translation table.
   * <p>
   * The packed form is a run-length encoding: a sequence of (count, value)
   * character pairs, each expanding to <code>count</code> consecutive map
   * entries holding <code>value</code>.  A pair always writes at least one
   * entry, even when its count is zero.  The whole packed string is consumed,
   * so the method does not depend on the length of one particular table.
   *
   * @param packed   the packed character translation table
   * @return         the unpacked character translation table (0x10000 entries)
   */
  private static char [] zzUnpackCMap(String packed) {
    final char [] map = new char[0x10000];
    final int packedLength = packed.length();
    int i = 0;  /* index in packed string  */
    int j = 0;  /* index in unpacked array */
    while (i < packedLength) {
      int  count = packed.charAt(i++);
      char value = packed.charAt(i++);
      do map[j++] = value; while (--count > 0);
    }
    return map;
  }
  /**
   * Refills the input buffer.
   * <p>
   * Steps, in order: shift the unread part of the buffer (from zzStartRead)
   * to the front, double the buffer if the current position has reached its
   * end, then read more characters from zzReader behind zzEndRead.
   *
   * @return      <code>false</code>, iff there was new input;
   *              <code>true</code> when the reader is at end of stream.
   * 
   * @exception   java.io.IOException  if any I/O-Error occurs
   */
  private boolean zzRefill() throws java.io.IOException {
    /* first: make room (if you can) */
    if (zzStartRead > 0) {
      // Everything before zzStartRead is no longer needed; slide the rest down.
      System.arraycopy(zzBuffer, zzStartRead,
                       zzBuffer, 0,
                       zzEndRead-zzStartRead);
      /* translate stored positions */
      zzEndRead-= zzStartRead;
      zzCurrentPos-= zzStartRead;
      zzMarkedPos-= zzStartRead;
      zzStartRead = 0;
    }
    /* is the buffer big enough? */
    if (zzCurrentPos >= zzBuffer.length) {
      /* if not: blow it up */
      char newBuffer[] = new char[zzCurrentPos*2];
      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
      zzBuffer = newBuffer;
    }
    /* finally: fill the buffer with new input */
    int numRead = zzReader.read(zzBuffer, zzEndRead,
                                            zzBuffer.length-zzEndRead);
    if (numRead > 0) {
      zzEndRead+= numRead;
      return false;
    }
    // unlikely but not impossible: read 0 characters, but not at end of stream
    if (numRead == 0) {
      // Fall back to a single-character read to tell "nothing right now" from EOF.
      int c = zzReader.read();
      if (c == -1) {
        return true;
      } else {
        zzBuffer[zzEndRead++] = (char) c;
        return false;
      }     
    }
    // numRead < 0: the reader reported end of stream
    return true;
  }
  /**
   * Closes the input stream and puts the scanner into its end-of-file state.
   *
   * @exception java.io.IOException if closing the reader fails
   */
  private final void yyclose() throws java.io.IOException {
    // Flag end of file and empty the buffer so that nothing more is scanned.
    zzAtEOF = true;
    zzEndRead = zzStartRead;
    if (zzReader == null) {
      return;
    }
    zzReader.close();
  }
  /**
   * Resets the scanner to read from a new input stream.
   * Does not close the old reader.
   *
   * Every scanner position and flag is returned to its starting value, so
   * whatever was buffered from the old input is discarded and the old input
   * stream <b>cannot</b> be reused.
   * Lexical state is set to <tt>YYINITIAL</tt>.
   *
   * If the internal scan buffer has grown beyond its initial length, it is
   * replaced by a fresh buffer of the initial length.
   *
   * @param reader   the new input stream
   */
  private final void yyreset(java.io.Reader reader) {
    zzReader = reader;

    // flags
    zzAtBOL = true;
    zzAtEOF = false;
    zzEOFDone = false;

    // buffer positions
    zzStartRead = 0;
    zzEndRead = 0;
    zzMarkedPos = 0;
    zzCurrentPos = 0;

    // input position counters
    yycolumn = 0;
    yychar = 0;
    yyline = 0;

    zzLexicalState = YYINITIAL;

    final boolean bufferHasGrown = zzBuffer.length > ZZ_BUFFERSIZE;
    if (bufferHasGrown) {
      zzBuffer = new char[ZZ_BUFFERSIZE];
    }
  }
  /**
   * Returns the current lexical state.
   *
   * @return the value last stored by yybegin(int) or yyreset
   */
  private final int yystate() {
    final int currentState = this.zzLexicalState;
    return currentState;
  }
  /**
   * Enters a new lexical state.  The new state is picked up when scanning of
   * the next match starts.
   *
   * @param newState the new lexical state
   */
  private final void yybegin(int newState) {
    this.zzLexicalState = newState;
  }
  /**
   * Returns the text matched by the current regular expression, as a new
   * String copied out of the scan buffer.
   */
  private final String yytext() {
    final int matchLength = zzMarkedPos - zzStartRead;
    return new String(zzBuffer, zzStartRead, matchLength);
  }
  /**
   * Returns the character at position <tt>pos</tt> from the
   * matched text.
   *
   * It is equivalent to yytext().charAt(pos), but faster, because it reads
   * the scan buffer directly instead of building a String.
   *
   * @param pos the position of the character to fetch.
   *            A value from 0 to yylength()-1.
   *
   * @return the character at position pos
   */
  private final char yycharat(int pos) {
    final int bufferIndex = zzStartRead + pos;
    return zzBuffer[bufferIndex];
  }
  /**
   * Returns the length of the matched text region, i.e. the distance between
   * the start of the current match and the marked position.
   */
  private final int yylength() {
    final int matchLength = zzMarkedPos - zzStartRead;
    return matchLength;
  }
  /**
   * Reports an error that occurred while scanning.
   *
   * In a well-formed scanner (no or only correct usage of
   * yypushback(int) and a match-all fallback rule) this method
   * will only be called with things that "Can't Possibly Happen".
   * If this method is called, something is seriously wrong
   * (e.g. a JFlex bug producing a faulty scanner etc.).
   *
   * Usual syntax/scanner level error handling should be done
   * in error fallback rules.
   *
   * This method never returns normally: it always throws an Error carrying
   * the message for the given code, or the "unknown error" message when the
   * code is not a valid index into the message table.
   *
   * @param   errorCode  the code of the error message to display
   */
  private void zzScanError(int errorCode) {
    final boolean knownCode = errorCode >= 0 && errorCode < ZZ_ERROR_MSG.length;
    final int messageIndex = knownCode ? errorCode : ZZ_UNKNOWN_ERROR;
    throw new Error(ZZ_ERROR_MSG[messageIndex]);
  }
  /**
   * Pushes the specified amount of characters back into the input stream.
   *
   * They will be read again by the next call of the scanning method.
   *
   * @param number  the number of characters to be read again.
   *                This number must not be greater than yylength()!
   */
  private void yypushback(int number)  {
    final boolean moreThanMatched = yylength() < number;
    if (moreThanMatched) {
      zzScanError(ZZ_PUSHBACK_2BIG);
    }
    // Shorten the current match from its end.
    zzMarkedPos = zzMarkedPos - number;
  }
  /**
   * Resumes scanning until the next regular expression is matched,
   * the end of input is encountered or an I/O-Error occurs.
   * <p>
   * Each pass of the outer loop scans one match by walking the transition
   * table, then dispatches on the action of the last accepting state.
   * Actions that call populateAttributes() return true as soon as a token
   * has been populated; if the match was skipped (too long) or is ignorable,
   * the loop simply scans the next match.
   *
   * @return      true if a token was populated; false at end of input
   * @exception   java.io.IOException  if any I/O-Error occurs
   */
  private boolean getNextToken() throws java.io.IOException {
    int zzInput;
    int zzAction;
    // cached fields:
    int zzCurrentPosL;
    int zzMarkedPosL;
    int zzEndReadL = zzEndRead;
    char [] zzBufferL = zzBuffer;
    char [] zzCMapL = ZZ_CMAP;
    int [] zzTransL = ZZ_TRANS;
    int [] zzRowMapL = ZZ_ROWMAP;
    int [] zzAttrL = ZZ_ATTRIBUTE;
    while (true) {
      zzMarkedPosL = zzMarkedPos;
      // advance the character offset by the length of the previous match
      yychar+= zzMarkedPosL-zzStartRead;
      // -1 means "no accepting state seen yet" for this match
      zzAction = -1;
      // the new match starts where the previous one ended
      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
      zzState = ZZ_LEXSTATE[zzLexicalState];
      zzForAction: {
        while (true) {
          if (zzCurrentPosL < zzEndReadL)
            zzInput = zzBufferL[zzCurrentPosL++];
          else if (zzAtEOF) {
            zzInput = YYEOF;
            break zzForAction;
          }
          else {
            // store back cached positions
            zzCurrentPos  = zzCurrentPosL;
            zzMarkedPos   = zzMarkedPosL;
            boolean eof = zzRefill();
            // get translated positions and possibly new buffer
            zzCurrentPosL  = zzCurrentPos;
            zzMarkedPosL   = zzMarkedPos;
            zzBufferL      = zzBuffer;
            zzEndReadL     = zzEndRead;
            if (eof) {
              zzInput = YYEOF;
              break zzForAction;
            }
            else {
              zzInput = zzBufferL[zzCurrentPosL++];
            }
          }
          // look up the transition for (current state, character class of input)
          int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
          if (zzNext == -1) break zzForAction;
          zzState = zzNext;
          int zzAttributes = zzAttrL[zzState];
          // bit 0 set: remember this state and position as the latest match
          if ( (zzAttributes & 1) == 1 ) {
            zzAction = zzState;
            zzMarkedPosL = zzCurrentPosL;
            // bit 3 set: stop scanning right here
            if ( (zzAttributes & 8) == 8 ) break zzForAction;
          }
        }
      }
      // store back cached position
      zzMarkedPos = zzMarkedPosL;
      // Each action case is followed by a "case N: break;" label.  When an
      // action does not return, control falls through to that break, leaves
      // the switch, and the outer loop scans the next match.
      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
        case 5: 
          { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true;
          }
        case 7: break;
        case 1: 
          { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
          }
        case 8: break;
        case 3: 
          { if (populateAttributes(NUMERIC_TYPE)) return true;
          }
        case 9: break;
        case 6: 
          { if (populateAttributes(HIRAGANA_TYPE)) return true;
          }
        case 10: break;
        case 4: 
          { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true;
          }
        case 11: break;
        case 2: 
          { if (populateAttributes(WORD_TYPE)) return true;
          }
        case 12: break;
        default: 
          // No action matched: either a clean end of input (nothing consumed
          // for this match) or a scanner error.
          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
            zzAtEOF = true;
              {
                return false;
              }
          } 
          else {
            zzScanError(ZZ_NO_MATCH);
          }
      }
    }
  }
 }
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex
@ -32,11 +32,14 @@ import org.apache.lucene.util.AttributeSource;
 * This class implements Word Break rules from the Unicode Text Segmentation 
 * algorithm, as specified in 
 * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
 * URLs and email addresses are also tokenized according to the relevant RFCs.
 * <p/>
 * Tokens produced are of the following types:
 * <ul>
 *   <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
 *   <li>&lt;NUM&gt;: A number</li>
 *   <li>&lt;URL&gt;: A URL</li>
 *   <li>&lt;EMAIL&gt;: An email address</li>
 *   <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
 *       Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
 *   <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
@ -57,7 +60,7 @@ import org.apache.lucene.util.AttributeSource;
 %final
 %public
 %apiprivate
-%class UAX29Tokenizer
+%class UAX29URLEmailTokenizer
 %extends Tokenizer
 %type boolean
 %function getNextToken
@ -67,7 +70,7 @@ import org.apache.lucene.util.AttributeSource;
  super(in);
 %init}
-// WB4. X (Extend | Format)* --> X
+// UAX#29 WB4. X (Extend | Format)* --> X
 //
 ALetterEx      = \p{WB:ALetter}                     [\p{WB:Format}\p{WB:Extend}]*
 // TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
@ -77,6 +80,85 @@ MidLetterEx    = [\p{WB:MidLetter}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]
 MidNumericEx   = [\p{WB:MidNum}\p{WB:MidNumLet}]    [\p{WB:Format}\p{WB:Extend}]*
 ExtendNumLetEx = \p{WB:ExtendNumLet}                [\p{WB:Format}\p{WB:Extend}]*
 // URL and E-mail syntax specifications:
 //
 //     RFC-952:  DOD INTERNET HOST TABLE SPECIFICATION
 //     RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
 //     RFC-1123: Requirements for Internet Hosts - Application and Support
 //     RFC-1738: Uniform Resource Locators (URL)
 //     RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
 //     RFC-5234: Augmented BNF for Syntax Specifications: ABNF
 //     RFC-5321: Simple Mail Transfer Protocol
 //     RFC-5322: Internet Message Format
 %include src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
 DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
 DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
 DomainNameLoose  = {DomainLabel} ("." {DomainLabel})*
 IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
 IPv4Address  = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3} 
 IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
 IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
 IPv6Address =                                                  ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
            |                                             "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
            |                            {IPv6Hex16Bit}?  "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::"  {IPv6Hex16Bit} ":"     {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::"                         {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::"                         {IPv6Hex16Bit}
            | (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"
 URIunreserved = [-._~A-Za-z0-9]
 URIpercentEncoded = "%" [0-9A-Fa-f]{2}
 URIsubDelims = [!$&'()*+,;=]
 URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
 URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
 URIquery    = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
 URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
 URIport = ":" [0-9]{1,5}
 URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}  
 URIhostLoose  = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose} 
 URIauthorityStrict =             {URIhostStrict} {URIport}?
 URIauthorityLoose  = {URIlogin}? {URIhostLoose}  {URIport}?
 HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
 HTTPpath = ("/" {HTTPsegment})*
 HTTPscheme = [hH][tT][tT][pP][sS]? "://"
 HTTPurlFull = {HTTPscheme} {URIauthorityLoose}  {HTTPpath}? {URIquery}? {URIfragment}?
 // {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
 HTTPurlNoScheme =          {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
 HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}
 FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
 FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
 FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
 FTPscheme = [fF][tT][pP] "://"
 FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?
 FILEscheme = [fF][iI][lL][eE] "://"
 FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?
 URL = {HTTPurl} | {FTPurl} | {FILEurl}
 EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
 // NOTE(review): inside this character class "+-/" is read as the range
 // U+002B..U+002F, so besides '+', '-' and '/' it also admits ',' and '.'.
 // RFC 5322 atext contains neither ',' nor '.' -- confirm whether the wider
 // set is intended, or whether '-' should be listed first in the class.
 EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
 EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
 EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
 EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
 // DFA minimization allows {IPv6Address} and {IPv4Address} to be included 
 // in the {EMAILbracketedHost} definition without incurring any size penalties, 
 // since {EMAILdomainLiteralText} recognizes all valid IP addresses.
 // The IP address regexes are included in {EMAILbracketedHost} simply as a 
 // reminder that they are acceptable bracketed host forms.
 EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"
 EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
 %{
  /** Alphanumeric sequences */
  public static final String WORD_TYPE = "<ALPHANUM>";
@ -84,6 +166,12 @@ ExtendNumLetEx = \p{WB:ExtendNumLet}                [\p{WB:Format}\p{WB:Extend}]
  /** Numbers */
  public static final String NUMERIC_TYPE = "<NUM>";
  /** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
  public static final String URL_TYPE = "<URL>";
  /** Token type assigned to e-mail addresses matched by the {EMAIL} rule. */
  public static final String EMAIL_TYPE = "<EMAIL>";
  /**
   * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
   * scripts (Thai, Lao, Myanmar, Khmer, etc.).  Sequences of these are kept 
@ -112,7 +200,7 @@ ExtendNumLetEx = \p{WB:ExtendNumLet}                [\p{WB:Format}\p{WB:Extend}]
   * @param source The AttributeSource to use
   * @param input The input reader
   */
-  public UAX29Tokenizer(AttributeSource source, Reader input) {
+  public UAX29URLEmailTokenizer(AttributeSource source, Reader input) {
    super(source, input);
    zzReader = input;
  }
@ -121,7 +209,7 @@ ExtendNumLetEx = \p{WB:ExtendNumLet}                [\p{WB:Format}\p{WB:Extend}]
   * @param factory The AttributeFactory to use
   * @param input The input reader
   */
-  public UAX29Tokenizer(AttributeFactory factory, Reader input) {
+  public UAX29URLEmailTokenizer(AttributeFactory factory, Reader input) {
    super(factory, input); 
    zzReader = input;
  }
@ -201,17 +289,19 @@ ExtendNumLetEx = \p{WB:ExtendNumLet}                [\p{WB:Format}\p{WB:Extend}]
 %%
-// WB1. 	sot 	÷ 	
+// UAX#29 WB1. 	sot 	÷ 	
-// WB2. 		÷ 	eot
+//        WB2. 		÷ 	eot
 //
 <<EOF>> { return false; }
 {URL}   { if (populateAttributes(URL_TYPE)) return true; }
 {EMAIL} {if (populateAttributes(EMAIL_TYPE)) return true; }
-// WB8.   Numeric × Numeric
+// UAX#29 WB8.   Numeric × Numeric
-// WB11.  Numeric (MidNum | MidNumLet) × Numeric
+//        WB11.  Numeric (MidNum | MidNumLet) × Numeric
-// WB12.  Numeric × (MidNum | MidNumLet) Numeric
+//        WB12.  Numeric × (MidNum | MidNumLet) Numeric
-// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+//        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
-// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
+//        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
 //
 {ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx} 
                              | {MidNumericEx} {NumericEx} 
@ -220,14 +310,14 @@ ExtendNumLetEx = \p{WB:ExtendNumLet}                [\p{WB:Format}\p{WB:Extend}]
  { if (populateAttributes(NUMERIC_TYPE)) return true; }
-// WB5.   ALetter × ALetter
+// UAX#29 WB5.   ALetter × ALetter
-// WB6.   ALetter × (MidLetter | MidNumLet) ALetter
+//        WB6.   ALetter × (MidLetter | MidNumLet) ALetter
-// WB7.   ALetter (MidLetter | MidNumLet) × ALetter
+//        WB7.   ALetter (MidLetter | MidNumLet) × ALetter
-// WB9.   ALetter × Numeric
+//        WB9.   ALetter × Numeric
-// WB10.  Numeric × ALetter
+//        WB10.  Numeric × ALetter
-// WB13.  Katakana × Katakana
+//        WB13.  Katakana × Katakana
-// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+//        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
-// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
+//        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
 //
 {ExtendNumLetEx}*  ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})* 
                   | ( {NumericEx}  ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
@ -260,15 +350,15 @@ ExtendNumLetEx = \p{WB:ExtendNumLet}                [\p{WB:Format}\p{WB:Extend}]
 //
 \p{LB:Complex_Context}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }
-// WB14.  Any ÷ Any
+// UAX#29 WB14.  Any ÷ Any
 //
 \p{Script:Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
 \p{Script:Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }
-// WB3.   CR × LF
+// UAX#29 WB3.   CR × LF
-// WB3a.  (Newline | CR | LF) ÷
+//        WB3a.  (Newline | CR | LF) ÷
-// WB3b.  ÷ (Newline | CR | LF)
+//        WB3b.  ÷ (Newline | CR | LF)
-// WB14.  Any ÷ Any
+//        WB14.  Any ÷ Any
 //
 [^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/package.html
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/package.html
@ -27,7 +27,10 @@
        as of Lucene 3.1, implements the Word Break rules from the Unicode Text 
        Segmentation algorithm, as specified in 
        <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
-        URLs and email addresses are also tokenized according to the relevant RFCs.
+        Unlike <code>UAX29URLEmailTokenizer</code>, URLs and email addresses are
        <b>not</b> tokenized as single tokens, but are instead split up into 
        tokens according to the UAX#29 word break rules.
        <br/>
        <code><a href="StandardAnalyzer">StandardAnalyzer</a></code> includes
        <code>StandardTokenizer</code>, 
        <code><a href="StandardFilter">StandardFilter</a></code>, 
@ -46,13 +49,11 @@
        <code><a href="../../../../../../all/org/apache/lucene/analysis/LowerCaseFilter.html">LowerCaseFilter</a></code>
        and <code><a href="../../../../../../all/org/apache/lucene/analysis/StopFilter.html">StopFilter</a></code>.
    </li>
-    <li><code><a href="UAX29Tokenizer.html">UAX29Tokenizer</a></code>: 
+    <li><code><a href="UAX29URLEmailTokenizer.html">UAX29URLEmailTokenizer</a></code>: 
-        implements the Word Break rules from the Unicode Text Segmentation 
+        implements the Word Break rules from the Unicode Text Segmentation
-        algorithm, as specified in
+        algorithm, as specified in 
        <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
-        Unlike <code>StandardTokenizer</code>, URLs and email addresses are
+        URLs and email addresses are also tokenized according to the relevant RFCs.
        <b>not</b> tokenized as single tokens, but are instead split up into 
        tokens according to the UAX#29 word break rules.
    </li>
 </ul>
 </body>
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
@ -2,21 +2,14 @@ package org.apache.lucene.analysis.core;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
@ -58,63 +51,6 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
    }
  };
  /**
   * Passes through tokens with type "<URL>" and blocks all other types.
   * <p>
   * The type is compared by value rather than by reference, so the filter
   * does not depend on the tokenizer handing out the very same String
   * instance that is stored in StandardTokenizer.TOKEN_TYPES.
   */
  private class URLFilter extends TokenFilter {
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    public URLFilter(TokenStream in) {
      super(in);
    }
    /**
     * Advances the wrapped stream until a URL-typed token is found.
     *
     * @return true if positioned on a URL token; false once the input is exhausted
     */
    @Override
    public final boolean incrementToken() throws java.io.IOException {
      final String urlType = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.URL];
      while (input.incrementToken()) {
        if (urlType.equals(typeAtt.type())) {
          return true;
        }
      }
      return false;
    }
  }
  /**
   * Passes through tokens with type "<EMAIL>" and blocks all other types.
   * <p>
   * The type is compared by value rather than by reference, so the filter
   * does not depend on the tokenizer handing out the very same String
   * instance that is stored in StandardTokenizer.TOKEN_TYPES.
   */
  private class EmailFilter extends TokenFilter {
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    public EmailFilter(TokenStream in) {
      super(in);
    }
    /**
     * Advances the wrapped stream until an e-mail-typed token is found.
     *
     * @return true if positioned on an e-mail token; false once the input is exhausted
     */
    @Override
    public final boolean incrementToken() throws java.io.IOException {
      final String emailType = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMAIL];
      while (input.incrementToken()) {
        if (emailType.equals(typeAtt.type())) {
          return true;
        }
      }
      return false;
    }
  }
  /** Analyzer whose token stream contains only the URL-typed tokens of the input. */
  private Analyzer urlAnalyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final StandardTokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
      // No length cap, so that arbitrarily long URLs are still tokenized.
      source.setMaxTokenLength(Integer.MAX_VALUE);
      return new TokenStreamComponents(source, new URLFilter(source));
    }
  };
  /** Analyzer whose token stream contains only the e-mail-typed tokens of the input. */
  private Analyzer emailAnalyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      final Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
      return new TokenStreamComponents(source, new EmailFilter(source));
    }
  };
  public void testArmenian() throws Exception {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
        new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից", 
@ -261,138 +197,6 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
        new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
  }
  /**
   * Reads a classpath resource, resolved relative to this test class, fully
   * into a String, decoding it as UTF-8.  The reader is always closed.
   */
  private String readResourceAsString(String resourceName) throws IOException {
    Reader reader = null;
    try {
      reader = new InputStreamReader
        (getClass().getResourceAsStream(resourceName), "UTF-8");
      StringBuilder builder = new StringBuilder();
      char[] buffer = new char[1024];
      int numCharsRead;
      while (-1 != (numCharsRead = reader.read(buffer))) {
        builder.append(buffer, 0, numCharsRead);
      }
      return builder.toString();
    } finally {
      if (null != reader) {
        reader.close();
      }
    }
  }
  /**
   * Reads a UTF-8 classpath resource line by line and returns its trimmed,
   * non-empty lines in file order.  The reader is always closed.
   */
  private String[] readNonBlankLines(String resourceName) throws IOException {
    BufferedReader bufferedReader = null;
    try {
      List<String> lines = new ArrayList<String>();
      bufferedReader = new BufferedReader(new InputStreamReader
        (getClass().getResourceAsStream(resourceName), "UTF-8"));
      String line;
      while (null != (line = bufferedReader.readLine())) {
        line = line.trim();
        if (line.length() > 0) {
          lines.add(line);
        }
      }
      return lines.toArray(new String[lines.size()]);
    } finally {
      if (null != bufferedReader) {
        bufferedReader.close();
      }
    }
  }
  /**
   * Analyzes the content of textResource with the given analyzer and asserts
   * that the produced tokens are exactly the non-blank lines of
   * expectedTokensResource.  Both resources must be non-empty.
   */
  private void assertAnalyzesResourceTo
    (Analyzer analyzer, String textResource, String expectedTokensResource)
    throws Exception {
    String text = readResourceAsString(textResource);
    assertTrue(null != text && text.length() > 0);
    String[] expectedTokens = readNonBlankLines(expectedTokensResource);
    assertTrue(null != expectedTokens && expectedTokens.length > 0);
    BaseTokenStreamTestCase.assertAnalyzesTo(analyzer, text, expectedTokens);
  }
  /** The URL-only analyzer must find every listed URL in the wiki page. */
  public void testWikiURLs() throws Exception {
    assertAnalyzesResourceTo(urlAnalyzer,
                             "LuceneResourcesWikiPage.html",
                             "LuceneResourcesWikiPageURLs.txt");
  }
  /** The e-mail-only analyzer must find every listed address in the random text. */
  public void testEmails() throws Exception {
    assertAnalyzesResourceTo(emailAnalyzer,
                             "random.text.with.email.addresses.txt",
                             "email.addresses.from.random.text.with.email.addresses.txt");
  }
  /** The URL-only analyzer must find every listed URL in the random text. */
  public void testURLs() throws Exception {
    assertAnalyzesResourceTo(urlAnalyzer,
                             "random.text.with.urls.txt",
                             "urls.from.random.text.with.urls.txt");
  }
  public void testUnicodeWordBreaks() throws Exception {
    WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
    wordBreakTest.test(a);
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java
@ -2,14 +2,21 @@ package org.apache.lucene.analysis.core;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.standard.UAX29Tokenizer;
+import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
@ -28,7 +35,7 @@ import java.util.Arrays;
 * limitations under the License.
 */
-public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
+public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
  public void testHugeDoc() throws IOException {
    StringBuilder sb = new StringBuilder();
@ -37,7 +44,7 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
    sb.append(whitespace);
    sb.append("testing 1234");
    String input = sb.toString();
-    UAX29Tokenizer tokenizer = new UAX29Tokenizer(new StringReader(input));
+    UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(new StringReader(input));
    BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
  }
@ -46,11 +53,70 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
    protected TokenStreamComponents createComponents
      (String fieldName, Reader reader) {
-      Tokenizer tokenizer = new UAX29Tokenizer(reader);
+      Tokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
      return new TokenStreamComponents(tokenizer);
    }
  };
  /** Passes through tokens with type "<URL>" and blocks all other types. */
  private class URLFilter extends TokenFilter {
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    public URLFilter(TokenStream in) {
      super(in);
    }
    @Override
    public final boolean incrementToken() throws java.io.IOException {
      boolean isTokenAvailable = false;
      while (input.incrementToken()) {
        if (typeAtt.type() == UAX29URLEmailTokenizer.URL_TYPE) {
          isTokenAvailable = true;
          break;
        }
      }
      return isTokenAvailable;
    }
  }
  /** Passes through tokens with type "<EMAIL>" and blocks all other types. */
  private class EmailFilter extends TokenFilter {
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    public EmailFilter(TokenStream in) {
      super(in);
    }
    @Override
    public final boolean incrementToken() throws java.io.IOException {
      boolean isTokenAvailable = false;
      while (input.incrementToken()) {
        if (typeAtt.type() == UAX29URLEmailTokenizer.EMAIL_TYPE) {
          isTokenAvailable = true;
          break;
        }
      }
      return isTokenAvailable;
    }
  }
  /** Analyzer that tokenizes with UAX29URLEmailTokenizer and keeps only what URLFilter lets through. */
  private Analyzer urlAnalyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      UAX29URLEmailTokenizer source = new UAX29URLEmailTokenizer(reader);
      // Raise the token length limit to its maximum so arbitrary length URLs are tokenized.
      source.setMaxTokenLength(Integer.MAX_VALUE);
      TokenFilter urlsOnly = new URLFilter(source);
      return new TokenStreamComponents(source, urlsOnly);
    }
  };
  /** Analyzer that tokenizes with UAX29URLEmailTokenizer and keeps only what EmailFilter lets through. */
  private Analyzer emailAnalyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      UAX29URLEmailTokenizer source = new UAX29URLEmailTokenizer(reader);
      TokenFilter emailsOnly = new EmailFilter(source);
      return new TokenStreamComponents(source, emailsOnly);
    }
  };
  public void testArmenian() throws Exception {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
        new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից", 
@ -163,7 +229,6 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
    BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
    BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
    BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
  }
  public void testTextWithNumbersSA() throws Exception {
@ -197,6 +262,140 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
        new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
  }
  /**
   * Analyzes the contents of "LuceneResourcesWikiPage.html" with urlAnalyzer and
   * asserts, via assertAnalyzesTo, that the output matches the trimmed,
   * non-empty lines of "LuceneResourcesWikiPageURLs.txt".
   * Both classpath resources are decoded as UTF-8.
   */
  public void testWikiURLs() throws Exception {
    final String pageResource = "LuceneResourcesWikiPage.html";
    final String urlsResource = "LuceneResourcesWikiPageURLs.txt";

    Reader reader = null;
    String luceneResourcesWikiPage;
    try {
      java.io.InputStream pageStream = getClass().getResourceAsStream(pageResource);
      // getResourceAsStream() returns null for a missing resource; report it by
      // name instead of failing with a bare NPE in the InputStreamReader constructor.
      assertTrue("Missing test resource: " + pageResource, null != pageStream);
      reader = new InputStreamReader(pageStream, "UTF-8");
      StringBuilder builder = new StringBuilder();
      char[] buffer = new char[1024];
      int numCharsRead;
      while (-1 != (numCharsRead = reader.read(buffer))) {
        builder.append(buffer, 0, numCharsRead);
      }
      luceneResourcesWikiPage = builder.toString();
    } finally {
      if (null != reader) {
        reader.close();
      }
    }
    assertTrue("Empty test resource: " + pageResource,
               luceneResourcesWikiPage.length() > 0);

    BufferedReader bufferedReader = null;
    String[] urls;
    try {
      java.io.InputStream urlsStream = getClass().getResourceAsStream(urlsResource);
      assertTrue("Missing test resource: " + urlsResource, null != urlsStream);
      List<String> urlList = new ArrayList<String>();
      bufferedReader = new BufferedReader(new InputStreamReader(urlsStream, "UTF-8"));
      String line;
      while (null != (line = bufferedReader.readLine())) {
        line = line.trim();
        // Blank lines carry no expected token.
        if (line.length() > 0) {
          urlList.add(line);
        }
      }
      urls = urlList.toArray(new String[urlList.size()]);
    } finally {
      if (null != bufferedReader) {
        bufferedReader.close();
      }
    }
    assertTrue("No expected URLs found in " + urlsResource, urls.length > 0);

    BaseTokenStreamTestCase.assertAnalyzesTo
      (urlAnalyzer, luceneResourcesWikiPage, urls);
  }
  /**
   * Analyzes the contents of "random.text.with.email.addresses.txt" with
   * emailAnalyzer and asserts, via assertAnalyzesTo, that the output matches the
   * trimmed, non-empty lines of
   * "email.addresses.from.random.text.with.email.addresses.txt".
   * Both classpath resources are decoded as UTF-8.
   */
  public void testEmails() throws Exception {
    final String textResource = "random.text.with.email.addresses.txt";
    final String emailsResource
      = "email.addresses.from.random.text.with.email.addresses.txt";

    Reader reader = null;
    String randomTextWithEmails;
    try {
      java.io.InputStream textStream = getClass().getResourceAsStream(textResource);
      // getResourceAsStream() returns null for a missing resource; report it by
      // name instead of failing with a bare NPE in the InputStreamReader constructor.
      assertTrue("Missing test resource: " + textResource, null != textStream);
      reader = new InputStreamReader(textStream, "UTF-8");
      StringBuilder builder = new StringBuilder();
      char[] buffer = new char[1024];
      int numCharsRead;
      while (-1 != (numCharsRead = reader.read(buffer))) {
        builder.append(buffer, 0, numCharsRead);
      }
      randomTextWithEmails = builder.toString();
    } finally {
      if (null != reader) {
        reader.close();
      }
    }
    assertTrue("Empty test resource: " + textResource,
               randomTextWithEmails.length() > 0);

    BufferedReader bufferedReader = null;
    String[] emails;
    try {
      java.io.InputStream emailsStream = getClass().getResourceAsStream(emailsResource);
      assertTrue("Missing test resource: " + emailsResource, null != emailsStream);
      List<String> emailList = new ArrayList<String>();
      bufferedReader = new BufferedReader(new InputStreamReader(emailsStream, "UTF-8"));
      String line;
      while (null != (line = bufferedReader.readLine())) {
        line = line.trim();
        // Blank lines carry no expected token.
        if (line.length() > 0) {
          emailList.add(line);
        }
      }
      emails = emailList.toArray(new String[emailList.size()]);
    } finally {
      if (null != bufferedReader) {
        bufferedReader.close();
      }
    }
    assertTrue("No expected e-mail addresses found in " + emailsResource,
               emails.length > 0);

    BaseTokenStreamTestCase.assertAnalyzesTo
      (emailAnalyzer, randomTextWithEmails, emails);
  }
  /**
   * Analyzes the contents of "random.text.with.urls.txt" with urlAnalyzer and
   * asserts, via assertAnalyzesTo, that the output matches the trimmed,
   * non-empty lines of "urls.from.random.text.with.urls.txt".
   * Both classpath resources are decoded as UTF-8.
   */
  public void testURLs() throws Exception {
    final String textResource = "random.text.with.urls.txt";
    final String urlsResource = "urls.from.random.text.with.urls.txt";

    Reader reader = null;
    String randomTextWithURLs;
    try {
      java.io.InputStream textStream = getClass().getResourceAsStream(textResource);
      // getResourceAsStream() returns null for a missing resource; report it by
      // name instead of failing with a bare NPE in the InputStreamReader constructor.
      assertTrue("Missing test resource: " + textResource, null != textStream);
      reader = new InputStreamReader(textStream, "UTF-8");
      StringBuilder builder = new StringBuilder();
      char[] buffer = new char[1024];
      int numCharsRead;
      while (-1 != (numCharsRead = reader.read(buffer))) {
        builder.append(buffer, 0, numCharsRead);
      }
      randomTextWithURLs = builder.toString();
    } finally {
      if (null != reader) {
        reader.close();
      }
    }
    assertTrue("Empty test resource: " + textResource,
               randomTextWithURLs.length() > 0);

    BufferedReader bufferedReader = null;
    String[] urls;
    try {
      java.io.InputStream urlsStream = getClass().getResourceAsStream(urlsResource);
      assertTrue("Missing test resource: " + urlsResource, null != urlsStream);
      List<String> urlList = new ArrayList<String>();
      bufferedReader = new BufferedReader(new InputStreamReader(urlsStream, "UTF-8"));
      String line;
      while (null != (line = bufferedReader.readLine())) {
        line = line.trim();
        // Blank lines carry no expected token.
        if (line.length() > 0) {
          urlList.add(line);
        }
      }
      urls = urlList.toArray(new String[urlList.size()]);
    } finally {
      if (null != bufferedReader) {
        bufferedReader.close();
      }
    }
    assertTrue("No expected URLs found in " + urlsResource, urls.length > 0);

    BaseTokenStreamTestCase.assertAnalyzesTo
      (urlAnalyzer, randomTextWithURLs, urls);
  }
  public void testUnicodeWordBreaks() throws Exception {
    WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
    wordBreakTest.test(a);
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
@ -123,7 +123,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
      assertAnalyzesToReuse(
          analyzer,
          "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
-          new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz@demo.com" });
+          new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });
 	}
 	/** @deprecated (3.1) for version back compat */
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -302,8 +302,10 @@ New Features
 * SOLR-1336: Add SmartChinese (word segmentation for Simplified Chinese) 
  tokenizer and filters to contrib/analysis-extras (rmuir)
-* SOLR-2211: Added UAX29TokenizerFactory, which implements UAX#29, a unicode algorithm 
+* SOLR-2211,LUCENE-2763: Added UAX29URLEmailTokenizerFactory, which implements
-  with good results for most languages.  (Tom Burton-West via rmuir)
+  UAX#29, a unicode algorithm with good results for most languages, as well as
  URL and E-mail tokenization according to the relevant RFCs.
  (Tom Burton-West via rmuir)
 * SOLR-2237: Added StempelPolishStemFilterFactory to contrib/analysis-extras (rmuir)
--- a/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java
+++ b/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java
@ -20,7 +20,7 @@ package org.apache.solr.analysis;
-import org.apache.lucene.analysis.standard.UAX29Tokenizer;
+import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
 import java.io.Reader;
 import java.util.Map;
@ -30,14 +30,14 @@ import java.util.Map;
 * 
 */
-public class UAX29TokenizerFactory extends BaseTokenizerFactory {
+public class UAX29URLEmailTokenizerFactory extends BaseTokenizerFactory {
  @Override
  public void init(Map<String,String> args) {
    super.init(args);
    assureMatchVersion();
  }
-  public UAX29Tokenizer create(Reader input) {
+  public UAX29URLEmailTokenizer create(Reader input) {
-    return new UAX29Tokenizer(input);
+    return new UAX29URLEmailTokenizer(input);
  }
 }
--- a/solr/src/test/org/apache/solr/analysis/TestUAX29TokenizerFactory.java
+++ b/solr/src/test/org/apache/solr/analysis/TestUAX29TokenizerFactory.java
@ -1,81 +0,0 @@
 package org.apache.solr.analysis;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.Reader;
 import java.io.StringReader;
 import org.apache.lucene.analysis.Tokenizer;
 /**
 * A few tests based on org.apache.lucene.analysis.TestUAX29Tokenizer.
 */
 public class TestUAX29TokenizerFactory extends BaseTokenTestCase {

  /**
   * Creates a UAX29TokenizerFactory, initializes it with DEFAULT_VERSION_PARAM
   * and returns the tokenizer it creates over the given text.
   */
  private Tokenizer tokenizerFor(String text) {
    UAX29TokenizerFactory tokenizerFactory = new UAX29TokenizerFactory();
    tokenizerFactory.init(DEFAULT_VERSION_PARAM);
    return tokenizerFactory.create(new StringReader(text));
  }

  /**
   * Test UAX29TokenizerFactory on a short English sentence containing a
   * combining mark and an apostrophe.
   */
  public void testUAX29Tokenizer() throws Exception {
    String[] expected = { "Wha\u0301t's", "this", "thing", "do" };
    assertTokenStreamContents(tokenizerFor("Wha\u0301t's this thing do?"), expected);
  }

  /** Arabic text with embedded quotation marks, punctuation and an English phrase. */
  public void testArabic() throws Exception {
    String text = "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.";
    String[] expected = {
        "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
        "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008"
    };
    assertTokenStreamContents(tokenizerFor(text), expected);
  }

  /** Chinese ideographs followed by fullwidth digits and fullwidth Latin letters. */
  public void testChinese() throws Exception {
    String[] expected = { "我", "是", "中", "国", "人", "１２３４", "Ｔｅｓｔｓ" };
    assertTokenStreamContents(tokenizerFor("我是中国人。 １２３４ Ｔｅｓｔｓ "), expected);
  }

  /** Two space-separated Korean words. */
  public void testKorean() throws Exception {
    String[] expected = { "안녕하세요", "한글입니다" };
    assertTokenStreamContents(tokenizerFor("안녕하세요 한글입니다"), expected);
  }

  /** A hyphenated phrase is expected to come out as its separate words. */
  public void testHyphen() throws Exception {
    String[] expected = { "some", "dashed", "phrase" };
    assertTokenStreamContents(tokenizerFor("some-dashed-phrase"), expected);
  }
 }
--- a/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java
+++ b/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java
@ -0,0 +1,155 @@
 package org.apache.solr.analysis;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.Reader;
 import java.io.StringReader;
 import org.apache.lucene.analysis.Tokenizer;
 /**
 * A few tests based on org.apache.lucene.analysis.TestUAX29URLEmailTokenizer
 */
 public class TestUAX29URLEmailTokenizerFactory extends BaseTokenTestCase {

  /**
   * Creates a UAX29URLEmailTokenizerFactory, initializes it with
   * DEFAULT_VERSION_PARAM and returns the tokenizer it creates over the given text.
   */
  private Tokenizer tokenizerFor(String text) {
    UAX29URLEmailTokenizerFactory tokenizerFactory = new UAX29URLEmailTokenizerFactory();
    tokenizerFactory.init(DEFAULT_VERSION_PARAM);
    return tokenizerFactory.create(new StringReader(text));
  }

  /** A short English sentence containing a combining mark and an apostrophe. */
  public void testUAX29URLEmailTokenizer() throws Exception {
    String[] expected = { "Wha\u0301t's", "this", "thing", "do" };
    assertTokenStreamContents(tokenizerFor("Wha\u0301t's this thing do?"), expected);
  }

  /** Arabic text with embedded quotation marks, punctuation and an English phrase. */
  public void testArabic() throws Exception {
    String text = "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.";
    String[] expected = {
        "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
        "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008"
    };
    assertTokenStreamContents(tokenizerFor(text), expected);
  }

  /** Chinese ideographs followed by fullwidth digits and fullwidth Latin letters. */
  public void testChinese() throws Exception {
    String[] expected = { "我", "是", "中", "国", "人", "１２３４", "Ｔｅｓｔｓ" };
    assertTokenStreamContents(tokenizerFor("我是中国人。 １２３４ Ｔｅｓｔｓ "), expected);
  }

  /** Two space-separated Korean words. */
  public void testKorean() throws Exception {
    String[] expected = { "안녕하세요", "한글입니다" };
    assertTokenStreamContents(tokenizerFor("안녕하세요 한글입니다"), expected);
  }

  /** A hyphenated phrase is expected to come out as its separate words. */
  public void testHyphen() throws Exception {
    String[] expected = { "some", "dashed", "phrase" };
    assertTokenStreamContents(tokenizerFor("some-dashed-phrase"), expected);
  }

  /**
   * Test with some URLs from TestUAX29URLEmailTokenizer's
   * urls.from.random.text.with.urls.txt, interleaved with ordinary words.
   */
  public void testURLs() throws Exception {
    String textWithURLs
      = "http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram&paragraphs=50&length=200&no-ads=on\n"
        + " some extra\nWords thrown in here. "
        + "http://c5-3486.bisynxu.FR/aI.YnNms/"
        + " samba Halta gamba "
        + "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R\n"
        + "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb\n"
        + "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m"
        + " inter Locutio "
        + "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/\n"
        + "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7"
        + " blah Sirrah woof "
        + "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4\n";
    String[] expected = {
      "http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram&paragraphs=50&length=200&no-ads=on",
      "some", "extra", "Words", "thrown", "in", "here",
      "http://c5-3486.bisynxu.FR/aI.YnNms/",
      "samba", "Halta", "gamba",
      "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R",
      "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb",
      "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m",
      "inter", "Locutio",
      "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/",
      "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7",
      "blah", "Sirrah", "woof",
      "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4"
    };
    assertTokenStreamContents(tokenizerFor(textWithURLs), expected);
  }

  /**
   * Test with some emails from TestUAX29URLEmailTokenizer's
   * email.addresses.from.random.text.with.email.addresses.txt, interleaved
   * with ordinary words.
   */
  public void testEmails() throws Exception {
    String textWithEmails
      =  " some extra\nWords thrown in here. "
         + "dJ8ngFi@avz13m.CC\n"
         + "kU-l6DS@[082.015.228.189]\n"
         + "\"%U\u0012@?\\B\"@Fl2d.md"
         + " samba Halta gamba "
         + "Bvd#@tupjv.sn\n"
         + "SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt\n"
         + "~+Kdz@3mousnl.SE\n"
         + " inter Locutio "
         + "C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY\n"
         + "}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM"
         + " blah Sirrah woof "
         + "lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae\n"
         + "lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H\n";
    String[] expected = {
      "some", "extra", "Words", "thrown", "in", "here",
      "dJ8ngFi@avz13m.CC",
      "kU-l6DS@[082.015.228.189]",
      "\"%U\u0012@?\\B\"@Fl2d.md",
      "samba", "Halta", "gamba",
      "Bvd#@tupjv.sn",
      "SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt",
      "~+Kdz@3mousnl.SE",
      "inter", "Locutio",
      "C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY",
      "}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM",
      "blah", "Sirrah", "woof",
      "lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae",
      "lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H"
    };
    assertTokenStreamContents(tokenizerFor(textWithEmails), expected);
  }
 }