LUCENE-2763: Swap URL+Email recognizing StandardTokenizer and UAX29Tokenizer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1043071 13f79535-47bb-0310-9956-ffa450edef68
2010-12-07 14:53:13 +00:00 · 2010-12-07 14:53:13 +00:00 · 2b9726ae81
parent 5b2e0f786b
commit 2b9726ae81
19 changed files with 3560 additions and 3461 deletions
--- a/modules/analysis/CHANGES.txt
+++ b/modules/analysis/CHANGES.txt
@ -9,14 +9,16 @@ API Changes

 * LUCENE-2413: Removed the AnalyzerUtil in common/miscellaneous.  (Robert Muir)

- * LUCENE-2167: StandardTokenizer/Analyzer in common/standard/ now implement
-   the Word Break rules from the Unicode Text Segmentation algorithm (UAX#29),
-   as well as tokenizing URLs and email addresses according to the relevant
-   RFCs.  ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
-   behavior.  (Steven Rowe, Robert Muir, Uwe Schindler)
+ * LUCENE-2167,LUCENE-2699,LUCENE-2763: StandardTokenizer/Analyzer in 
+   common/standard/ now implement the Word Break rules from the Unicode 6.0.0
+   Text Segmentation algorithm (UAX#29).  
   
- * LUCENE-2699: Update StandardTokenizer and UAX29Tokenizer to Unicode 6.0.0.
-   (Steven Rowe)
+   ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
+   implementation and behavior.
+
+   UAX29URLEmailTokenizer tokenizes URLs and E-mail addresses according to the
+   relevant RFCs, in addition to implementing the UAX#29 Word Break rules.
+   (Steven Rowe, Robert Muir, Uwe Schindler)

 * LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles
   can be generated. (Chris Harris via Steven Rowe)
--- a/modules/analysis/common/build.xml
+++ b/modules/analysis/common/build.xml
@ -38,7 +38,7 @@

  <target name="compile-core" depends="jflex-notice, common.compile-core"/>

-  <target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-UAX29Tokenizer,jflex-wiki-tokenizer"/>
+  <target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,jflex-wiki-tokenizer"/>

  <target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
    <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
@ -62,11 +62,11 @@
           nobak="on" />
  </target>

-  <target name="jflex-UAX29Tokenizer" depends="jflex-check" if="jflex.present">
+  <target name="jflex-UAX29URLEmailTokenizer" depends="jflex-check" if="jflex.present">
    <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
 			<classpath refid="jflex.classpath"/>
    </taskdef>
-    <jflex file="src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex"
+    <jflex file="src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex"
           outdir="src/java/org/apache/lucene/analysis/standard"
           nobak="on" />
  </target>
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
@ -15,8 +15,8 @@
 */

 // Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
-// file version from Tuesday, October 12, 2010 11:34:09 AM UTC
-// generated on Wednesday, October 13, 2010 4:12:27 AM UTC
+// file version from Saturday, December 4, 2010 12:34:19 PM UTC
+// generated on Sunday, December 5, 2010 12:24:12 AM UTC
 // by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros

 ASCIITLD = "." (
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/3/10 9:07 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 12/4/10 7:24 PM */

 package org.apache.lucene.analysis.standard;

@ -33,8 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 /**
 * This class is a scanner generated by 
 * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 10/3/10 9:07 AM from the specification file
- * <tt>C:/Users/rmuir/workspace/lucene-clean/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
+ * on 12/4/10 7:24 PM from the specification file
+ * <tt>C:/cygwin/home/us/svn/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
 */
 class ClassicTokenizerImpl implements StandardTokenizerInterface {

@ -630,6 +630,12 @@ public final void getText(CharTermAttribute t) {
  
      zzState = ZZ_LEXSTATE[zzLexicalState];

+      // set up zzAction for empty match case:
+      int zzAttributes = zzAttrL[zzState];
+      if ( (zzAttributes & 1) == 1 ) {
+        zzAction = zzState;
+      }
+

      zzForAction: {
        while (true) {
@ -662,7 +668,7 @@ public final void getText(CharTermAttribute t) {
          if (zzNext == -1) break zzForAction;
          zzState = zzNext;

-          int zzAttributes = zzAttrL[zzState];
+          zzAttributes = zzAttrL[zzState];
          if ( (zzAttributes & 1) == 1 ) {
            zzAction = zzState;
            zzMarkedPosL = zzCurrentPosL;
@ -676,45 +682,45 @@ public final void getText(CharTermAttribute t) {
      zzMarkedPos = zzMarkedPosL;

      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
-        case 10: 
-          { return EMAIL;
-          }
-        case 11: break;
-        case 2: 
-          { return ALPHANUM;
-          }
-        case 12: break;
-        case 4: 
-          { return HOST;
-          }
-        case 13: break;
-        case 1: 
-          { /* ignore */
-          }
-        case 14: break;
-        case 8: 
-          { return ACRONYM_DEP;
-          }
-        case 15: break;
        case 5: 
          { return NUM;
          }
-        case 16: break;
+        case 11: break;
        case 9: 
          { return ACRONYM;
          }
-        case 17: break;
+        case 12: break;
        case 7: 
          { return COMPANY;
          }
-        case 18: break;
+        case 13: break;
+        case 10: 
+          { return EMAIL;
+          }
+        case 14: break;
+        case 1: 
+          { /* ignore */
+          }
+        case 15: break;
        case 6: 
          { return APOSTROPHE;
          }
-        case 19: break;
+        case 16: break;
        case 3: 
          { return CJ;
          }
+        case 17: break;
+        case 8: 
+          { return ACRONYM_DEP;
+          }
+        case 18: break;
+        case 2: 
+          { return ALPHANUM;
+          }
+        case 19: break;
+        case 4: 
+          { return HOST;
+          }
        case 20: break;
        default: 
          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt
@ -16,6 +16,6 @@
 */


-WARNING: if you change StandardTokenizerImpl*.jflex and need to regenerate
-      the tokenizer, only use the trunk version of JFlex 1.5 (with a minimum
-      SVN revision 597) at the moment!
+WARNING: if you change StandardTokenizerImpl*.jflex or UAX29URLEmailTokenizer.jflex
+      and need to regenerate the tokenizer, only use the trunk version
+      of JFlex 1.5 (with a minimum SVN revision 597) at the moment!
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
@ -83,10 +83,9 @@ public final class StandardTokenizer extends Tokenizer {
  @Deprecated
  public static final int ACRONYM_DEP       = 8;

-  public static final int URL = 9;
-  public static final int SOUTHEAST_ASIAN = 10;
-  public static final int IDEOGRAPHIC = 11;
-  public static final int HIRAGANA = 12;
+  public static final int SOUTHEAST_ASIAN = 9;
+  public static final int IDEOGRAPHIC = 10;
+  public static final int HIRAGANA = 11;
  
  /** String token types that correspond to token type int constants */
  public static final String [] TOKEN_TYPES = new String [] {
@ -99,7 +98,6 @@ public final class StandardTokenizer extends Tokenizer {
    "<NUM>",
    "<CJ>",
    "<ACRONYM_DEP>",
-    "<URL>",
    "<SOUTHEAST_ASIAN>",
    "<IDEOGRAPHIC>",
    "<HIRAGANA>"
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
@ -23,14 +23,11 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 * This class implements Word Break rules from the Unicode Text Segmentation 
 * algorithm, as specified in 
 * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a> 
- * URLs and email addresses are also tokenized according to the relevant RFCs.
 * <p/>
 * Tokens produced are of the following types:
 * <ul>
 *   <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
 *   <li>&lt;NUM&gt;: A number</li>
- *   <li>&lt;URL&gt;: A URL</li>
- *   <li>&lt;EMAIL&gt;: An email address</li>
 *   <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
 *       Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
 *   <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
@ -67,83 +64,6 @@ MidNumericEx   = [\p{WB:MidNum}\p{WB:MidNumLet}]    [\p{WB:Format}\p{WB:Extend}]
 ExtendNumLetEx = \p{WB:ExtendNumLet}                [\p{WB:Format}\p{WB:Extend}]*


-// URL and E-mail syntax specifications:
-//
-//     RFC-952:  DOD INTERNET HOST TABLE SPECIFICATION
-//     RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
-//     RFC-1123: Requirements for Internet Hosts - Application and Support
-//     RFC-1738: Uniform Resource Locators (URL)
-//     RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
-//     RFC-5234: Augmented BNF for Syntax Specifications: ABNF
-//     RFC-5321: Simple Mail Transfer Protocol
-//     RFC-5322: Internet Message Format
-
-%include src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
-
-DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
-DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
-DomainNameLoose  = {DomainLabel} ("." {DomainLabel})*
-
-IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
-IPv4Address  = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3} 
-IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
-IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
-IPv6Address =                                                  ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
-            |                                             "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
-            |                            {IPv6Hex16Bit}?  "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
-            | (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
-            | (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
-            | (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::"  {IPv6Hex16Bit} ":"     {IPv6LeastSignificant32Bits}
-            | (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::"                         {IPv6LeastSignificant32Bits}
-            | (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::"                         {IPv6Hex16Bit}
-            | (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"
-
-URIunreserved = [-._~A-Za-z0-9]
-URIpercentEncoded = "%" [0-9A-Fa-f]{2}
-URIsubDelims = [!$&'()*+,;=]
-URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
-URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
-URIquery    = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
-URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
-URIport = ":" [0-9]{1,5}
-URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}  
-URIhostLoose  = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose} 
-
-URIauthorityStrict =             {URIhostStrict} {URIport}?
-URIauthorityLoose  = {URIlogin}? {URIhostLoose}  {URIport}?
-
-HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
-HTTPpath = ("/" {HTTPsegment})*
-HTTPscheme = [hH][tT][tT][pP][sS]? "://"
-HTTPurlFull = {HTTPscheme} {URIauthorityLoose}  {HTTPpath}? {URIquery}? {URIfragment}?
-// {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
-HTTPurlNoScheme =          {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
-HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}
-
-FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
-FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
-FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
-FTPscheme = [fF][tT][pP] "://"
-FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?
-
-FILEscheme = [fF][iI][lL][eE] "://"
-FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?
-
-URL = {HTTPurl} | {FTPurl} | {FILEurl}
-
-EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
-EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
-EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
-EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
-EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
-// DFA minimization allows {IPv6Address} and {IPv4Address} to be included 
-// in the {EMAILbracketedHost} definition without incurring any size penalties, 
-// since {EMAILdomainLiteralText} recognizes all valid IP addresses.
-// The IP address regexes are included in {EMAILbracketedHost} simply as a 
-// reminder that they are acceptable bracketed host forms.
-EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"
-EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
-
 %{
  /** Alphanumeric sequences */
  public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;
@ -151,12 +71,6 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
  /** Numbers */
  public static final int NUMERIC_TYPE = StandardTokenizer.NUM;
  
-  /** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
-  public static final int URL_TYPE = StandardTokenizer.URL;
-  
-  /** E-mail addresses */
-  public static final int EMAIL_TYPE = StandardTokenizer.EMAIL;
-  
  /**
   * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
   * scripts (Thai, Lao, Myanmar, Khmer, etc.).  Sequences of these are kept 
@ -191,9 +105,6 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
 //
 <<EOF>> { return StandardTokenizerInterface.YYEOF; }

-{URL}   { return URL_TYPE; }
-{EMAIL} { return EMAIL_TYPE; }
-
 // UAX#29 WB8.   Numeric × Numeric
 //        WB11.  Numeric (MidNum | MidNumLet) × Numeric
 //        WB12.  Numeric × (MidNum | MidNumLet) Numeric
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.java
@ -1,847 +0,0 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/13/10 12:12 AM */
-
-package org.apache.lucene.analysis.standard;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.Reader;
-
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.util.AttributeSource;
-
-
-/**
- * This class implements Word Break rules from the Unicode Text Segmentation 
- * algorithm, as specified in 
- * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a> 
- * <p/>
- * Tokens produced are of the following types:
- * <ul>
- *   <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
- *   <li>&lt;NUM&gt;: A number</li>
- *   <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
- *       Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
- *   <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
- *   <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
- * </ul>
- * <b>WARNING</b>: Because JFlex does not support Unicode supplementary 
- * characters (characters above the Basic Multilingual Plane, which contains
- * those up to and including U+FFFF), this scanner will not recognize them
- * properly.  If you need to be able to process text containing supplementary 
- * characters, consider using the ICU4J-backed implementation in modules/analysis/icu  
- * (org.apache.lucene.analysis.icu.segmentation.ICUTokenizer)
- * instead of this class, since the ICU4J-backed implementation does not have
- * this limitation.
- */
-
-public final class UAX29Tokenizer extends Tokenizer {
-
-  /** This character denotes the end of file */
-  private static final int YYEOF = -1;
-
-  /** initial size of the lookahead buffer */
-  private static final int ZZ_BUFFERSIZE = 16384;
-
-  /** lexical states */
-  private static final int YYINITIAL = 0;
-
-  /**
-   * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
-   * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
-   *                  at the beginning of a line
-   * l is of the form l = 2*k, k a non negative integer
-   */
-  private static final int ZZ_LEXSTATE[] = { 
-     0, 0
-  };
-
-  /** 
-   * Translates characters to character classes
-   */
-  private static final String ZZ_CMAP_PACKED = 
-    "\47\0\1\7\4\0\1\6\1\0\1\7\1\0\12\3\1\5\1\6"+
-    "\5\0\32\1\4\0\1\10\1\0\32\1\57\0\1\1\2\0\1\2"+
-    "\7\0\1\1\1\0\1\5\2\0\1\1\5\0\27\1\1\0\37\1"+
-    "\1\0\u01ca\1\4\0\14\1\16\0\5\1\7\0\1\1\1\0\1\1"+
-    "\21\0\160\2\5\1\1\0\2\1\2\0\4\1\1\6\7\0\1\1"+
-    "\1\5\3\1\1\0\1\1\1\0\24\1\1\0\123\1\1\0\213\1"+
-    "\1\0\7\2\236\1\11\0\46\1\2\0\1\1\7\0\47\1\1\0"+
-    "\1\6\7\0\55\2\1\0\1\2\1\0\2\2\1\0\2\2\1\0"+
-    "\1\2\10\0\33\1\5\0\4\1\1\5\13\0\4\2\10\0\2\6"+
-    "\2\0\13\2\5\0\53\1\25\2\12\3\1\0\1\3\1\6\1\0"+
-    "\2\1\1\2\143\1\1\0\1\1\10\2\1\0\6\2\2\1\2\2"+
-    "\1\0\4\2\2\1\12\3\3\1\2\0\1\1\17\0\1\2\1\1"+
-    "\1\2\36\1\33\2\2\0\131\1\13\2\1\1\16\0\12\3\41\1"+
-    "\11\2\2\1\2\0\1\6\1\0\1\1\5\0\26\1\4\2\1\1"+
-    "\11\2\1\1\3\2\1\1\5\2\22\0\31\1\3\2\244\0\4\2"+
-    "\66\1\3\2\1\1\22\2\1\1\7\2\12\1\2\2\2\0\12\3"+
-    "\1\0\7\1\1\0\7\1\1\0\3\2\1\0\10\1\2\0\2\1"+
-    "\2\0\26\1\1\0\7\1\1\0\1\1\3\0\4\1\2\0\1\2"+
-    "\1\1\7\2\2\0\2\2\2\0\3\2\1\1\10\0\1\2\4\0"+
-    "\2\1\1\0\3\1\2\2\2\0\12\3\2\1\17\0\3\2\1\0"+
-    "\6\1\4\0\2\1\2\0\26\1\1\0\7\1\1\0\2\1\1\0"+
-    "\2\1\1\0\2\1\2\0\1\2\1\0\5\2\4\0\2\2\2\0"+
-    "\3\2\3\0\1\2\7\0\4\1\1\0\1\1\7\0\12\3\2\2"+
-    "\3\1\1\2\13\0\3\2\1\0\11\1\1\0\3\1\1\0\26\1"+
-    "\1\0\7\1\1\0\2\1\1\0\5\1\2\0\1\2\1\1\10\2"+
-    "\1\0\3\2\1\0\3\2\2\0\1\1\17\0\2\1\2\2\2\0"+
-    "\12\3\21\0\3\2\1\0\10\1\2\0\2\1\2\0\26\1\1\0"+
-    "\7\1\1\0\2\1\1\0\5\1\2\0\1\2\1\1\7\2\2\0"+
-    "\2\2\2\0\3\2\10\0\2\2\4\0\2\1\1\0\3\1\2\2"+
-    "\2\0\12\3\1\0\1\1\20\0\1\2\1\1\1\0\6\1\3\0"+
-    "\3\1\1\0\4\1\3\0\2\1\1\0\1\1\1\0\2\1\3\0"+
-    "\2\1\3\0\3\1\3\0\14\1\4\0\5\2\3\0\3\2\1\0"+
-    "\4\2\2\0\1\1\6\0\1\2\16\0\12\3\21\0\3\2\1\0"+
-    "\10\1\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1\3\0"+
-    "\1\1\7\2\1\0\3\2\1\0\4\2\7\0\2\2\1\0\2\1"+
-    "\6\0\2\1\2\2\2\0\12\3\22\0\2\2\1\0\10\1\1\0"+
-    "\3\1\1\0\27\1\1\0\12\1\1\0\5\1\2\0\1\2\1\1"+
-    "\7\2\1\0\3\2\1\0\4\2\7\0\2\2\7\0\1\1\1\0"+
-    "\2\1\2\2\2\0\12\3\1\0\2\1\17\0\2\2\1\0\10\1"+
-    "\1\0\3\1\1\0\51\1\2\0\1\1\7\2\1\0\3\2\1\0"+
-    "\4\2\1\1\10\0\1\2\10\0\2\1\2\2\2\0\12\3\12\0"+
-    "\6\1\2\0\2\2\1\0\22\1\3\0\30\1\1\0\11\1\1\0"+
-    "\1\1\2\0\7\1\3\0\1\2\4\0\6\2\1\0\1\2\1\0"+
-    "\10\2\22\0\2\2\15\0\60\11\1\12\2\11\7\12\5\0\7\11"+
-    "\10\12\1\0\12\3\47\0\2\11\1\0\1\11\2\0\2\11\1\0"+
-    "\1\11\2\0\1\11\6\0\4\11\1\0\7\11\1\0\3\11\1\0"+
-    "\1\11\1\0\1\11\2\0\2\11\1\0\4\11\1\12\2\11\6\12"+
-    "\1\0\2\12\1\11\2\0\5\11\1\0\1\11\1\0\6\12\2\0"+
-    "\12\3\2\0\2\11\42\0\1\1\27\0\2\2\6\0\12\3\13\0"+
-    "\1\2\1\0\1\2\1\0\1\2\4\0\2\2\10\1\1\0\44\1"+
-    "\4\0\24\2\1\0\2\2\5\1\13\2\1\0\44\2\11\0\1\2"+
-    "\71\0\53\11\24\12\1\11\12\3\6\0\6\11\4\12\4\11\3\12"+
-    "\1\11\3\12\2\11\7\12\3\11\4\12\15\11\14\12\1\11\1\12"+
-    "\12\3\4\12\2\11\46\1\12\0\53\1\1\0\1\1\3\0\u0149\1"+
-    "\1\0\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0\51\1"+
-    "\1\0\4\1\2\0\41\1\1\0\4\1\2\0\7\1\1\0\1\1"+
-    "\1\0\4\1\2\0\17\1\1\0\71\1\1\0\4\1\2\0\103\1"+
-    "\2\0\3\2\40\0\20\1\20\0\125\1\14\0\u026c\1\2\0\21\1"+
-    "\1\0\32\1\5\0\113\1\3\0\3\1\17\0\15\1\1\0\4\1"+
-    "\3\2\13\0\22\1\3\2\13\0\22\1\2\2\14\0\15\1\1\0"+
-    "\3\1\1\0\2\2\14\0\64\11\40\12\3\0\1\11\4\0\1\11"+
-    "\1\12\2\0\12\3\41\0\3\2\2\0\12\3\6\0\130\1\10\0"+
-    "\51\1\1\2\1\1\5\0\106\1\12\0\35\1\3\0\14\2\4\0"+
-    "\14\2\12\0\12\3\36\11\2\0\5\11\13\0\54\11\4\0\21\12"+
-    "\7\11\2\12\6\0\12\3\1\11\3\0\2\11\40\0\27\1\5\2"+
-    "\4\0\65\11\12\12\1\0\35\12\2\0\1\2\12\3\6\0\12\3"+
-    "\6\0\16\11\122\0\5\2\57\1\21\2\7\1\4\0\12\3\21\0"+
-    "\11\2\14\0\3\2\36\1\12\2\3\0\2\1\12\3\6\0\46\1"+
-    "\16\2\14\0\44\1\24\2\10\0\12\3\3\0\3\1\12\3\44\1"+
-    "\122\0\3\2\1\0\25\2\4\1\1\2\4\1\1\2\15\0\300\1"+
-    "\47\2\25\0\4\2\u0116\1\2\0\6\1\2\0\46\1\2\0\6\1"+
-    "\2\0\10\1\1\0\1\1\1\0\1\1\1\0\1\1\1\0\37\1"+
-    "\2\0\65\1\1\0\7\1\1\0\1\1\3\0\3\1\1\0\7\1"+
-    "\3\0\4\1\2\0\6\1\4\0\15\1\5\0\3\1\1\0\7\1"+
-    "\17\0\4\2\10\0\2\7\12\0\1\7\2\0\1\5\2\0\5\2"+
-    "\20\0\2\10\3\0\1\6\17\0\1\10\13\0\5\2\5\0\6\2"+
-    "\1\0\1\1\15\0\1\1\20\0\15\1\63\0\41\2\21\0\1\1"+
-    "\4\0\1\1\2\0\12\1\1\0\1\1\3\0\5\1\6\0\1\1"+
-    "\1\0\1\1\1\0\1\1\1\0\4\1\1\0\13\1\2\0\4\1"+
-    "\5\0\5\1\4\0\1\1\21\0\51\1\u032d\0\64\1\u0716\0\57\1"+
-    "\1\0\57\1\1\0\205\1\6\0\4\1\3\2\16\0\46\1\12\0"+
-    "\66\1\11\0\1\1\17\0\1\2\27\1\11\0\7\1\1\0\7\1"+
-    "\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1"+
-    "\1\0\7\1\1\0\40\2\57\0\1\1\120\0\32\13\1\0\131\13"+
-    "\14\0\326\13\57\0\1\1\1\0\1\13\31\0\11\13\6\2\1\0"+
-    "\5\4\2\0\3\13\1\1\1\1\4\0\126\14\2\0\2\2\2\4"+
-    "\3\14\133\4\1\0\4\4\5\0\51\1\3\0\136\1\21\0\33\1"+
-    "\65\0\20\4\320\0\57\4\1\0\130\4\250\0\u19b6\13\112\0\u51cc\13"+
-    "\64\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\3\2\1"+
-    "\24\0\57\1\4\2\11\0\2\2\1\0\31\1\10\0\120\1\2\2"+
-    "\45\0\11\1\2\0\147\1\2\0\4\1\1\0\2\1\16\0\12\1"+
-    "\120\0\10\1\1\2\3\1\1\2\4\1\1\2\27\1\5\2\30\0"+
-    "\64\1\14\0\2\2\62\1\21\2\13\0\12\3\6\0\22\2\6\1"+
-    "\3\0\1\1\4\0\12\3\34\1\10\2\2\0\27\1\15\2\14\0"+
-    "\35\1\3\0\4\2\57\1\16\2\16\0\1\1\12\3\46\0\51\1"+
-    "\16\2\11\0\3\1\1\2\10\1\2\2\2\0\12\3\6\0\33\11"+
-    "\1\12\4\0\60\11\1\12\1\11\3\12\2\11\2\12\5\11\2\12"+
-    "\1\11\1\12\1\11\30\0\5\11\41\0\6\1\2\0\6\1\2\0"+
-    "\6\1\11\0\7\1\1\0\7\1\221\0\43\1\10\2\1\0\2\2"+
-    "\2\0\12\3\6\0\u2ba4\1\14\0\27\1\4\0\61\1\u2104\0\u012e\13"+
-    "\2\0\76\13\2\0\152\13\46\0\7\1\14\0\5\1\5\0\1\1"+
-    "\1\2\12\1\1\0\15\1\1\0\5\1\1\0\1\1\1\0\2\1"+
-    "\1\0\2\1\1\0\154\1\41\0\u016b\1\22\0\100\1\2\0\66\1"+
-    "\50\0\14\1\4\0\20\2\1\6\2\0\1\5\1\6\13\0\7\2"+
-    "\14\0\2\10\30\0\3\10\1\6\1\0\1\7\1\0\1\6\1\5"+
-    "\32\0\5\1\1\0\207\1\2\0\1\2\7\0\1\7\4\0\1\6"+
-    "\1\0\1\7\1\0\12\3\1\5\1\6\5\0\32\1\4\0\1\10"+
-    "\1\0\32\1\13\0\70\4\2\2\37\1\3\0\6\1\2\0\6\1"+
-    "\2\0\6\1\2\0\3\1\34\0\3\2\4\0";
-
-  /** 
-   * Translates characters to character classes
-   */
-  private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
-
-  /** 
-   * Translates DFA states to action switch labels.
-   */
-  private static final int [] ZZ_ACTION = zzUnpackAction();
-
-  private static final String ZZ_ACTION_PACKED_0 =
-    "\1\0\1\1\1\2\1\3\1\2\1\1\1\4\1\5"+
-    "\1\6\1\2\1\0\1\2\1\0\1\3\2\0";
-
-  private static int [] zzUnpackAction() {
-    int [] result = new int[16];
-    int offset = 0;
-    offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
-    return result;
-  }
-
-  private static int zzUnpackAction(String packed, int offset, int [] result) {
-    int i = 0;       /* index in packed string  */
-    int j = offset;  /* index in unpacked array */
-    int l = packed.length();
-    while (i < l) {
-      int count = packed.charAt(i++);
-      int value = packed.charAt(i++);
-      do result[j++] = value; while (--count > 0);
-    }
-    return j;
-  }
-
-
-  /** 
-   * Translates a state to a row index in the transition table
-   */
-  private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
-
-  private static final String ZZ_ROWMAP_PACKED_0 =
-    "\0\0\0\15\0\32\0\47\0\64\0\101\0\116\0\15"+
-    "\0\15\0\133\0\150\0\165\0\202\0\217\0\101\0\234";
-
-  private static int [] zzUnpackRowMap() {
-    int [] result = new int[16];
-    int offset = 0;
-    offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
-    return result;
-  }
-
-  private static int zzUnpackRowMap(String packed, int offset, int [] result) {
-    int i = 0;  /* index in packed string  */
-    int j = offset;  /* index in unpacked array */
-    int l = packed.length();
-    while (i < l) {
-      int high = packed.charAt(i++) << 16;
-      result[j++] = high | packed.charAt(i++);
-    }
-    return j;
-  }
-
-  /** 
-   * The transition table of the DFA
-   */
-  private static final int [] ZZ_TRANS = zzUnpackTrans();
-
-  private static final String ZZ_TRANS_PACKED_0 =
-    "\1\2\1\3\1\2\1\4\1\5\3\2\1\6\2\7"+
-    "\1\10\1\11\16\0\2\3\1\12\1\0\1\13\1\0"+
-    "\1\13\1\14\1\0\1\3\3\0\1\3\2\4\2\0"+
-    "\2\15\1\16\1\0\1\4\4\0\1\5\1\0\1\5"+
-    "\3\0\1\14\1\0\1\5\3\0\1\3\1\17\1\4"+
-    "\1\5\3\0\1\17\1\0\1\17\13\0\2\7\3\0"+
-    "\1\3\2\12\2\0\2\20\1\14\1\0\1\12\3\0"+
-    "\1\3\1\13\7\0\1\13\3\0\1\3\1\14\1\12"+
-    "\1\5\3\0\1\14\1\0\1\14\4\0\1\15\1\4"+
-    "\6\0\1\15\3\0\1\3\1\16\1\4\1\5\3\0"+
-    "\1\16\1\0\1\16\4\0\1\20\1\12\6\0\1\20"+
-    "\2\0";
-
-  private static int [] zzUnpackTrans() {
-    int [] result = new int[169];
-    int offset = 0;
-    offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
-    return result;
-  }
-
-  private static int zzUnpackTrans(String packed, int offset, int [] result) {
-    int i = 0;       /* index in packed string  */
-    int j = offset;  /* index in unpacked array */
-    int l = packed.length();
-    while (i < l) {
-      int count = packed.charAt(i++);
-      int value = packed.charAt(i++);
-      value--;
-      do result[j++] = value; while (--count > 0);
-    }
-    return j;
-  }
-
-
-  /* error codes */
-  private static final int ZZ_UNKNOWN_ERROR = 0;
-  private static final int ZZ_NO_MATCH = 1;
-  private static final int ZZ_PUSHBACK_2BIG = 2;
-
-  /* error messages for the codes above */
-  private static final String ZZ_ERROR_MSG[] = {
-    "Unkown internal scanner error",
-    "Error: could not match input",
-    "Error: pushback value was too large"
-  };
-
-  /**
-   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
-   */
-  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
-
-  private static final String ZZ_ATTRIBUTE_PACKED_0 =
-    "\1\0\1\11\5\1\2\11\1\1\1\0\1\1\1\0"+
-    "\1\1\2\0";
-
-  private static int [] zzUnpackAttribute() {
-    int [] result = new int[16];
-    int offset = 0;
-    offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
-    return result;
-  }
-
-  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
-    int i = 0;       /* index in packed string  */
-    int j = offset;  /* index in unpacked array */
-    int l = packed.length();
-    while (i < l) {
-      int count = packed.charAt(i++);
-      int value = packed.charAt(i++);
-      do result[j++] = value; while (--count > 0);
-    }
-    return j;
-  }
-
-  /** the input device */
-  private java.io.Reader zzReader;
-
-  /** the current state of the DFA */
-  private int zzState;
-
-  /** the current lexical state */
-  private int zzLexicalState = YYINITIAL;
-
-  /** this buffer contains the current text to be matched and is
-      the source of the yytext() string */
-  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
-
-  /** the textposition at the last accepting state */
-  private int zzMarkedPos;
-
-  /** the current text position in the buffer */
-  private int zzCurrentPos;
-
-  /** startRead marks the beginning of the yytext() string in the buffer */
-  private int zzStartRead;
-
-  /** endRead marks the last character in the buffer, that has been read
-      from input */
-  private int zzEndRead;
-
-  /** number of newlines encountered up to the start of the matched text */
-  private int yyline;
-
-  /** the number of characters up to the start of the matched text */
-  private int yychar;
-
-  /**
-   * the number of characters from the last newline up to the start of the 
-   * matched text
-   */
-  private int yycolumn;
-
-  /** 
-   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
-   */
-  private boolean zzAtBOL = true;
-
-  /** zzAtEOF == true <=> the scanner is at the EOF */
-  private boolean zzAtEOF;
-
-  /** denotes if the user-EOF-code has already been executed */
-  private boolean zzEOFDone;
-
-  /* user code: */
-  /** Alphanumeric sequences */
-  public static final String WORD_TYPE = "<ALPHANUM>";
-  
-  /** Numbers */
-  public static final String NUMERIC_TYPE = "<NUM>";
-  
-  /**
-   * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
-   * scripts (Thai, Lao, Myanmar, Khmer, etc.).  Sequences of these are kept 
-   * together as as a single token rather than broken up, because the logic
-   * required to break them at word boundaries is too complex for UAX#29.
-   * <p>
-   * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
-   */
-  public static final String SOUTH_EAST_ASIAN_TYPE = "<SOUTHEAST_ASIAN>";
-  
-  public static final String IDEOGRAPHIC_TYPE = "<IDEOGRAPHIC>";
-  
-  public static final String HIRAGANA_TYPE = "<HIRAGANA>";
-  
-  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-  private final PositionIncrementAttribute posIncrAtt 
-    = addAttribute(PositionIncrementAttribute.class);
-  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-  
-  private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
-  private int posIncr;
-
-  
-  /**
-   * @param source The AttributeSource to use
-   * @param input The input reader
-   */
-  public UAX29Tokenizer(AttributeSource source, Reader input) {
-    super(source, input);
-    zzReader = input;
-  }
-  
-  /**
-   * @param factory The AttributeFactory to use
-   * @param input The input reader
-   */
-  public UAX29Tokenizer(AttributeFactory factory, Reader input) {
-    super(factory, input); 
-    zzReader = input;
-  }
-  
-  /** 
-   * Set the max allowed token length.  Any token longer than this is skipped.
-   * @param length the new max allowed token length
-   */
-  public void setMaxTokenLength(int length) {
-    this.maxTokenLength = length;
-  }
-
-  /**
-   * Returns the max allowed token length.  Any token longer than this is 
-   * skipped.
-   * @return the max allowed token length 
-   */
-  public int getMaxTokenLength() {
-    return maxTokenLength;
-  }
-
-  @Override
-  public final void end() {
-    // set final offset
-    int finalOffset = correctOffset(yychar + yylength());
-    offsetAtt.setOffset(finalOffset, finalOffset);
-  }
-
-  @Override
-  public void reset(Reader reader) throws IOException {
-    super.reset(reader);
-    yyreset(reader);
-  }
-
-  @Override
-  public final boolean incrementToken() throws IOException {
-    // This method is required because of two JFlex limitations:
-    // 1. No way to insert code at the beginning of the generated scanning
-    //    get-next-token method; and
-    // 2. No way to declare @Override on the generated scanning method.
-    clearAttributes();
-    posIncr = 1;
-    return getNextToken();
-  }
-
-  /**
-   * Populates this TokenStream's CharTermAttribute and OffsetAttribute from
-   * the current match, the TypeAttribute from the passed-in tokenType, and
-   * the PositionIncrementAttribute to one, unless the immediately previous
-   * token(s) was/were skipped because maxTokenLength was exceeded, in which
-   * case the PositionIncrementAttribute is set to one plus the number of
-   * skipped overly long tokens. 
-   * <p/> 
-   * If maxTokenLength is exceeded, the CharTermAttribute is set back to empty
-   * and false is returned.
-   * 
-   * @param tokenType The type of the matching token
-   * @return true there is a token available (not too long); false otherwise 
-   */
-  private boolean populateAttributes(String tokenType) {
-    boolean isTokenAvailable = false;
-    if (yylength() > maxTokenLength) {
-      // When we skip a too-long token, we treat it like a stopword, introducing
-      // a position increment gap
-      ++posIncr;
-    } else {
-      termAtt.copyBuffer(zzBuffer, zzStartRead, yylength());
-      posIncrAtt.setPositionIncrement(posIncr);
-      offsetAtt.setOffset(correctOffset(yychar),
-                          correctOffset(yychar + yylength()));
-      typeAtt.setType(tokenType);
-      isTokenAvailable = true;
-    }
-    return isTokenAvailable;
-  }
-
-
-  /**
-   * Creates a new scanner
-   * There is also a java.io.InputStream version of this constructor.
-   *
-   * @param   in  the java.io.Reader to read input from.
-   */
-  public UAX29Tokenizer(java.io.Reader in) {
-    super(in);
-    this.zzReader = in;
-  }
-
-  /**
-   * Creates a new scanner.
-   * There is also java.io.Reader version of this constructor.
-   *
-   * @param   in  the java.io.Inputstream to read input from.
-   */
-  public UAX29Tokenizer(java.io.InputStream in) {
-    this(new java.io.InputStreamReader(in));
-  }
-
-  /** 
-   * Unpacks the compressed character translation table.
-   *
-   * @param packed   the packed character translation table
-   * @return         the unpacked character translation table
-   */
-  private static char [] zzUnpackCMap(String packed) {
-    char [] map = new char[0x10000];
-    int i = 0;  /* index in packed string  */
-    int j = 0;  /* index in unpacked array */
-    while (i < 2174) {
-      int  count = packed.charAt(i++);
-      char value = packed.charAt(i++);
-      do map[j++] = value; while (--count > 0);
-    }
-    return map;
-  }
-
-
-  /**
-   * Refills the input buffer.
-   *
-   * @return      <code>false</code>, iff there was new input.
-   * 
-   * @exception   java.io.IOException  if any I/O-Error occurs
-   */
-  private boolean zzRefill() throws java.io.IOException {
-
-    /* first: make room (if you can) */
-    if (zzStartRead > 0) {
-      System.arraycopy(zzBuffer, zzStartRead,
-                       zzBuffer, 0,
-                       zzEndRead-zzStartRead);
-
-      /* translate stored positions */
-      zzEndRead-= zzStartRead;
-      zzCurrentPos-= zzStartRead;
-      zzMarkedPos-= zzStartRead;
-      zzStartRead = 0;
-    }
-
-    /* is the buffer big enough? */
-    if (zzCurrentPos >= zzBuffer.length) {
-      /* if not: blow it up */
-      char newBuffer[] = new char[zzCurrentPos*2];
-      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
-      zzBuffer = newBuffer;
-    }
-
-    /* finally: fill the buffer with new input */
-    int numRead = zzReader.read(zzBuffer, zzEndRead,
-                                            zzBuffer.length-zzEndRead);
-
-    if (numRead > 0) {
-      zzEndRead+= numRead;
-      return false;
-    }
-    // unlikely but not impossible: read 0 characters, but not at end of stream    
-    if (numRead == 0) {
-      int c = zzReader.read();
-      if (c == -1) {
-        return true;
-      } else {
-        zzBuffer[zzEndRead++] = (char) c;
-        return false;
-      }     
-    }
-
-	// numRead < 0
-    return true;
-  }
-
-    
-  /**
-   * Closes the input stream.
-   */
-  private final void yyclose() throws java.io.IOException {
-    zzAtEOF = true;            /* indicate end of file */
-    zzEndRead = zzStartRead;  /* invalidate buffer    */
-
-    if (zzReader != null)
-      zzReader.close();
-  }
-
-
-  /**
-   * Resets the scanner to read from a new input stream.
-   * Does not close the old reader.
-   *
-   * All internal variables are reset, the old input stream 
-   * <b>cannot</b> be reused (internal buffer is discarded and lost).
-   * Lexical state is set to <tt>ZZ_INITIAL</tt>.
-   *
-   * Internal scan buffer is resized down to its initial length, if it has grown.
-   *
-   * @param reader   the new input stream 
-   */
-  private final void yyreset(java.io.Reader reader) {
-    zzReader = reader;
-    zzAtBOL  = true;
-    zzAtEOF  = false;
-    zzEOFDone = false;
-    zzEndRead = zzStartRead = 0;
-    zzCurrentPos = zzMarkedPos = 0;
-    yyline = yychar = yycolumn = 0;
-    zzLexicalState = YYINITIAL;
-    if (zzBuffer.length > ZZ_BUFFERSIZE)
-      zzBuffer = new char[ZZ_BUFFERSIZE];
-  }
-
-
-  /**
-   * Returns the current lexical state.
-   */
-  private final int yystate() {
-    return zzLexicalState;
-  }
-
-
-  /**
-   * Enters a new lexical state
-   *
-   * @param newState the new lexical state
-   */
-  private final void yybegin(int newState) {
-    zzLexicalState = newState;
-  }
-
-
-  /**
-   * Returns the text matched by the current regular expression.
-   */
-  private final String yytext() {
-    return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
-  }
-
-
-  /**
-   * Returns the character at position <tt>pos</tt> from the 
-   * matched text. 
-   * 
-   * It is equivalent to yytext().charAt(pos), but faster
-   *
-   * @param pos the position of the character to fetch. 
-   *            A value from 0 to yylength()-1.
-   *
-   * @return the character at position pos
-   */
-  private final char yycharat(int pos) {
-    return zzBuffer[zzStartRead+pos];
-  }
-
-
-  /**
-   * Returns the length of the matched text region.
-   */
-  private final int yylength() {
-    return zzMarkedPos-zzStartRead;
-  }
-
-
-  /**
-   * Reports an error that occured while scanning.
-   *
-   * In a wellformed scanner (no or only correct usage of 
-   * yypushback(int) and a match-all fallback rule) this method 
-   * will only be called with things that "Can't Possibly Happen".
-   * If this method is called, something is seriously wrong
-   * (e.g. a JFlex bug producing a faulty scanner etc.).
-   *
-   * Usual syntax/scanner level error handling should be done
-   * in error fallback rules.
-   *
-   * @param   errorCode  the code of the errormessage to display
-   */
-  private void zzScanError(int errorCode) {
-    String message;
-    try {
-      message = ZZ_ERROR_MSG[errorCode];
-    }
-    catch (ArrayIndexOutOfBoundsException e) {
-      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
-    }
-
-    throw new Error(message);
-  } 
-
-
-  /**
-   * Pushes the specified amount of characters back into the input stream.
-   *
-   * They will be read again by then next call of the scanning method
-   *
-   * @param number  the number of characters to be read again.
-   *                This number must not be greater than yylength()!
-   */
-  private void yypushback(int number)  {
-    if ( number > yylength() )
-      zzScanError(ZZ_PUSHBACK_2BIG);
-
-    zzMarkedPos -= number;
-  }
-
-
-  /**
-   * Resumes scanning until the next regular expression is matched,
-   * the end of input is encountered or an I/O-Error occurs.
-   *
-   * @return      the next token
-   * @exception   java.io.IOException  if any I/O-Error occurs
-   */
-  private boolean getNextToken() throws java.io.IOException {
-    int zzInput;
-    int zzAction;
-
-    // cached fields:
-    int zzCurrentPosL;
-    int zzMarkedPosL;
-    int zzEndReadL = zzEndRead;
-    char [] zzBufferL = zzBuffer;
-    char [] zzCMapL = ZZ_CMAP;
-
-    int [] zzTransL = ZZ_TRANS;
-    int [] zzRowMapL = ZZ_ROWMAP;
-    int [] zzAttrL = ZZ_ATTRIBUTE;
-
-    while (true) {
-      zzMarkedPosL = zzMarkedPos;
-
-      yychar+= zzMarkedPosL-zzStartRead;
-
-      zzAction = -1;
-
-      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
-  
-      zzState = ZZ_LEXSTATE[zzLexicalState];
-
-
-      zzForAction: {
-        while (true) {
-    
-          if (zzCurrentPosL < zzEndReadL)
-            zzInput = zzBufferL[zzCurrentPosL++];
-          else if (zzAtEOF) {
-            zzInput = YYEOF;
-            break zzForAction;
-          }
-          else {
-            // store back cached positions
-            zzCurrentPos  = zzCurrentPosL;
-            zzMarkedPos   = zzMarkedPosL;
-            boolean eof = zzRefill();
-            // get translated positions and possibly new buffer
-            zzCurrentPosL  = zzCurrentPos;
-            zzMarkedPosL   = zzMarkedPos;
-            zzBufferL      = zzBuffer;
-            zzEndReadL     = zzEndRead;
-            if (eof) {
-              zzInput = YYEOF;
-              break zzForAction;
-            }
-            else {
-              zzInput = zzBufferL[zzCurrentPosL++];
-            }
-          }
-          int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
-          if (zzNext == -1) break zzForAction;
-          zzState = zzNext;
-
-          int zzAttributes = zzAttrL[zzState];
-          if ( (zzAttributes & 1) == 1 ) {
-            zzAction = zzState;
-            zzMarkedPosL = zzCurrentPosL;
-            if ( (zzAttributes & 8) == 8 ) break zzForAction;
-          }
-
-        }
-      }
-
-      // store back cached position
-      zzMarkedPos = zzMarkedPosL;
-
-      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
-        case 5: 
-          { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true;
-          }
-        case 7: break;
-        case 1: 
-          { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
-          }
-        case 8: break;
-        case 3: 
-          { if (populateAttributes(NUMERIC_TYPE)) return true;
-          }
-        case 9: break;
-        case 6: 
-          { if (populateAttributes(HIRAGANA_TYPE)) return true;
-          }
-        case 10: break;
-        case 4: 
-          { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true;
-          }
-        case 11: break;
-        case 2: 
-          { if (populateAttributes(WORD_TYPE)) return true;
-          }
-        case 12: break;
-        default: 
-          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
-            zzAtEOF = true;
-              {
-                return false;
-              }
-          } 
-          else {
-            zzScanError(ZZ_NO_MATCH);
-          }
-      }
-    }
-  }
-
-
-}
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex
@ -32,11 +32,14 @@ import org.apache.lucene.util.AttributeSource;
 * This class implements Word Break rules from the Unicode Text Segmentation 
 * algorithm, as specified in 
- * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a> 
+ * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
+ * URLs and email addresses are also tokenized according to the relevant RFCs.
 * <p/>
 * Tokens produced are of the following types:
 * <ul>
 *   <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
 *   <li>&lt;NUM&gt;: A number</li>
+ *   <li>&lt;URL&gt;: A URL</li>
+ *   <li>&lt;EMAIL&gt;: An email address</li>
 *   <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
 *       Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
 *   <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
@ -57,7 +60,7 @@ import org.apache.lucene.util.AttributeSource;
 %final
 %public
 %apiprivate
-%class UAX29Tokenizer
+%class UAX29URLEmailTokenizer
 %extends Tokenizer
 %type boolean
 %function getNextToken
@ -67,7 +70,7 @@ import org.apache.lucene.util.AttributeSource;
  super(in);
 %init}

-// WB4. X (Extend | Format)* --> X
+// UAX#29 WB4. X (Extend | Format)* --> X
 //
 ALetterEx      = \p{WB:ALetter}                     [\p{WB:Format}\p{WB:Extend}]*
 // TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
@ -77,6 +80,85 @@ MidLetterEx    = [\p{WB:MidLetter}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]
 MidNumericEx   = [\p{WB:MidNum}\p{WB:MidNumLet}]    [\p{WB:Format}\p{WB:Extend}]*
 ExtendNumLetEx = \p{WB:ExtendNumLet}                [\p{WB:Format}\p{WB:Extend}]*

+
+// URL and E-mail syntax specifications:
+//
+//     RFC-952:  DOD INTERNET HOST TABLE SPECIFICATION
+//     RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
+//     RFC-1123: Requirements for Internet Hosts - Application and Support
+//     RFC-1738: Uniform Resource Locators (URL)
+//     RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
+//     RFC-5234: Augmented BNF for Syntax Specifications: ABNF
+//     RFC-5321: Simple Mail Transfer Protocol
+//     RFC-5322: Internet Message Format
+
+%include src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
+
+DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
+DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
+DomainNameLoose  = {DomainLabel} ("." {DomainLabel})*
+
+IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
+IPv4Address  = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3} 
+IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
+IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
+IPv6Address =                                                  ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
+            |                                             "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
+            |                            {IPv6Hex16Bit}?  "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
+            | (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
+            | (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
+            | (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::"  {IPv6Hex16Bit} ":"     {IPv6LeastSignificant32Bits}
+            | (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::"                         {IPv6LeastSignificant32Bits}
+            | (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::"                         {IPv6Hex16Bit}
+            | (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"
+
+URIunreserved = [-._~A-Za-z0-9]
+URIpercentEncoded = "%" [0-9A-Fa-f]{2}
+URIsubDelims = [!$&'()*+,;=]
+URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
+URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
+URIquery    = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
+URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
+URIport = ":" [0-9]{1,5}
+URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}  
+URIhostLoose  = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose} 
+
+URIauthorityStrict =             {URIhostStrict} {URIport}?
+URIauthorityLoose  = {URIlogin}? {URIhostLoose}  {URIport}?
+
+HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
+HTTPpath = ("/" {HTTPsegment})*
+HTTPscheme = [hH][tT][tT][pP][sS]? "://"
+HTTPurlFull = {HTTPscheme} {URIauthorityLoose}  {HTTPpath}? {URIquery}? {URIfragment}?
+// {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
+HTTPurlNoScheme =          {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
+HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}
+
+FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
+FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
+FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
+FTPscheme = [fF][tT][pP] "://"
+FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?
+
+FILEscheme = [fF][iI][lL][eE] "://"
+FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?
+
+URL = {HTTPurl} | {FTPurl} | {FILEurl}
+
+EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
+EMAILatomText = [-A-Za-z0-9!#$%&'*+/=?\^_`{|}~]
+EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
+EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
+EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
+// DFA minimization allows {IPv6Address} and {IPv4Address} to be included 
+// in the {EMAILbracketedHost} definition without incurring any size penalties, 
+// since {EMAILdomainLiteralText} recognizes all valid IP addresses.
+// The IP address regexes are included in {EMAILbracketedHost} simply as a 
+// reminder that they are acceptable bracketed host forms.
+EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"
+EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
+
+
 %{
  /** Alphanumeric sequences */
  public static final String WORD_TYPE = "<ALPHANUM>";
@ -84,6 +166,12 @@ ExtendNumLetEx = \p{WB:ExtendNumLet}                [\p{WB:Format}\p{WB:Extend}]
  /** Numbers */
  public static final String NUMERIC_TYPE = "<NUM>";
  
+  /** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
+  public static final String URL_TYPE = "<URL>";
+  
+  /** E-mail addresses: a local part, "@", then a domain name or bracketed host literal */
+  public static final String EMAIL_TYPE = "<EMAIL>";
+  
  /**
   * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
   * scripts (Thai, Lao, Myanmar, Khmer, etc.).  Sequences of these are kept 
@ -112,7 +200,7 @@ ExtendNumLetEx = \p{WB:ExtendNumLet}                [\p{WB:Format}\p{WB:Extend}]
   * @param source The AttributeSource to use
   * @param input The input reader
   */
-  public UAX29Tokenizer(AttributeSource source, Reader input) {
+  public UAX29URLEmailTokenizer(AttributeSource source, Reader input) {
    super(source, input);
    zzReader = input;
  }
@ -121,7 +209,7 @@ ExtendNumLetEx = \p{WB:ExtendNumLet}                [\p{WB:Format}\p{WB:Extend}]
   * @param factory The AttributeFactory to use
   * @param input The input reader
   */
-  public UAX29Tokenizer(AttributeFactory factory, Reader input) {
+  public UAX29URLEmailTokenizer(AttributeFactory factory, Reader input) {
    super(factory, input); 
    zzReader = input;
  }
@ -201,17 +289,19 @@ ExtendNumLetEx = \p{WB:ExtendNumLet}                [\p{WB:Format}\p{WB:Extend}]

 %%

-// WB1. 	sot 	÷ 	
-// WB2. 		÷ 	eot
+// UAX#29 WB1. 	sot 	÷ 	
+//        WB2. 		÷ 	eot
 //
 <<EOF>> { return false; }

+{URL}   { if (populateAttributes(URL_TYPE)) return true; }
+{EMAIL} { if (populateAttributes(EMAIL_TYPE)) return true; }

-// WB8.   Numeric × Numeric
-// WB11.  Numeric (MidNum | MidNumLet) × Numeric
-// WB12.  Numeric × (MidNum | MidNumLet) Numeric
-// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
-// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
+// UAX#29 WB8.   Numeric × Numeric
+//        WB11.  Numeric (MidNum | MidNumLet) × Numeric
+//        WB12.  Numeric × (MidNum | MidNumLet) Numeric
+//        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+//        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
 //
 {ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx} 
                              | {MidNumericEx} {NumericEx} 
@ -220,14 +310,14 @@ ExtendNumLetEx = \p{WB:ExtendNumLet}                [\p{WB:Format}\p{WB:Extend}]
  { if (populateAttributes(NUMERIC_TYPE)) return true; }


-// WB5.   ALetter × ALetter
-// WB6.   ALetter × (MidLetter | MidNumLet) ALetter
-// WB7.   ALetter (MidLetter | MidNumLet) × ALetter
-// WB9.   ALetter × Numeric
-// WB10.  Numeric × ALetter
-// WB13.  Katakana × Katakana
-// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
-// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
+// UAX#29 WB5.   ALetter × ALetter
+//        WB6.   ALetter × (MidLetter | MidNumLet) ALetter
+//        WB7.   ALetter (MidLetter | MidNumLet) × ALetter
+//        WB9.   ALetter × Numeric
+//        WB10.  Numeric × ALetter
+//        WB13.  Katakana × Katakana
+//        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+//        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
 //
 {ExtendNumLetEx}*  ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})* 
                   | ( {NumericEx}  ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
@ -260,15 +350,15 @@ ExtendNumLetEx = \p{WB:ExtendNumLet}                [\p{WB:Format}\p{WB:Extend}]
 //
 \p{LB:Complex_Context}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }

-// WB14.  Any ÷ Any
+// UAX#29 WB14.  Any ÷ Any
 //
 \p{Script:Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
 \p{Script:Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }


-// WB3.   CR × LF
-// WB3a.  (Newline | CR | LF) ÷
-// WB3b.  ÷ (Newline | CR | LF)
-// WB14.  Any ÷ Any
+// UAX#29 WB3.   CR × LF
+//        WB3a.  (Newline | CR | LF) ÷
+//        WB3b.  ÷ (Newline | CR | LF)
+//        WB14.  Any ÷ Any
 //
 [^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/package.html
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/package.html
@ -27,7 +27,10 @@
        as of Lucene 3.1, implements the Word Break rules from the Unicode Text 
        Segmentation algorithm, as specified in 
        <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
-        URLs and email addresses are also tokenized according to the relevant RFCs.
+        Unlike <code>UAX29URLEmailTokenizer</code>, URLs and email addresses are
+        <b>not</b> tokenized as single tokens, but are instead split up into 
+        tokens according to the UAX#29 word break rules.
+        <br/>
        <code><a href="StandardAnalyzer">StandardAnalyzer</a></code> includes
        <code>StandardTokenizer</code>, 
        <code><a href="StandardFilter">StandardFilter</a></code>, 
@ -46,13 +49,11 @@
        <code><a href="../../../../../../all/org/apache/lucene/analysis/LowerCaseFilter.html">LowerCaseFilter</a></code>
        and <code><a href="../../../../../../all/org/apache/lucene/analysis/StopFilter.html">StopFilter</a></code>.
    </li>
-    <li><code><a href="UAX29Tokenizer.html">UAX29Tokenizer</a></code>: 
+    <li><code><a href="UAX29URLEmailTokenizer.html">UAX29URLEmailTokenizer</a></code>: 
        implements the Word Break rules from the Unicode Text Segmentation
        algorithm, as specified in 
        <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
-        Unlike <code>StandardTokenizer</code>, URLs and email addresses are
-        <b>not</b> tokenized as single tokens, but are instead split up into 
-        tokens according to the UAX#29 word break rules.
+        URLs and email addresses are also tokenized according to the relevant RFCs.
    </li>
 </ul>
 </body>
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
@ -2,21 +2,14 @@ package org.apache.lucene.analysis.core;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;

-import java.io.BufferedReader;
 import java.io.IOException;
-import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
-import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.List;

 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
@ -58,63 +51,6 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
    }
  };

-  /** Passes through tokens with type "<URL>" and blocks all other types. */
-  private class URLFilter extends TokenFilter {
-    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-    public URLFilter(TokenStream in) {
-      super(in);
-    }
-    @Override
-    public final boolean incrementToken() throws java.io.IOException {
-      boolean isTokenAvailable = false;
-      while (input.incrementToken()) {
-        if (typeAtt.type() == StandardTokenizer.TOKEN_TYPES[StandardTokenizer.URL]) {
-          isTokenAvailable = true;
-          break;
-        }
-      }
-      return isTokenAvailable;
-    }
-  }
-  
-  /** Passes through tokens with type "<EMAIL>" and blocks all other types. */
-  private class EmailFilter extends TokenFilter {
-    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-    public EmailFilter(TokenStream in) {
-      super(in);
-    }
-    @Override
-    public final boolean incrementToken() throws java.io.IOException {
-      boolean isTokenAvailable = false;
-      while (input.incrementToken()) {
-        if (typeAtt.type() == StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMAIL]) {
-          isTokenAvailable = true;
-          break;
-        }
-      }
-      return isTokenAvailable;
-    }
-  }
-
-  private Analyzer urlAnalyzer = new ReusableAnalyzerBase() {
-    @Override
-    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      StandardTokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
-      tokenizer.setMaxTokenLength(Integer.MAX_VALUE);  // Tokenize arbitrary length URLs
-      TokenFilter filter = new URLFilter(tokenizer);
-      return new TokenStreamComponents(tokenizer, filter);
-    }
-  };
-
-  private Analyzer emailAnalyzer = new ReusableAnalyzerBase() {
-    @Override
-    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
-      TokenFilter filter = new EmailFilter(tokenizer);
-      return new TokenStreamComponents(tokenizer, filter);
-    }
-  };
-
  public void testArmenian() throws Exception {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
        new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից", 
@ -261,138 +197,6 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
        new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
  }
  
-  public void testWikiURLs() throws Exception {
-    Reader reader = null;
-    String luceneResourcesWikiPage;
-    try {
-      reader = new InputStreamReader
-        (getClass().getResourceAsStream("LuceneResourcesWikiPage.html"), "UTF-8");
-      StringBuilder builder = new StringBuilder();
-      char[] buffer = new char[1024];
-      int numCharsRead;
-      while (-1 != (numCharsRead = reader.read(buffer))) {
-        builder.append(buffer, 0, numCharsRead);
-      }
-      luceneResourcesWikiPage = builder.toString(); 
-    } finally {
-      if (null != reader) {
-        reader.close();
-      }
-    }
-    assertTrue(null != luceneResourcesWikiPage 
-               && luceneResourcesWikiPage.length() > 0);
-    BufferedReader bufferedReader = null;
-    String[] urls;
-    try {
-      List<String> urlList = new ArrayList<String>();
-      bufferedReader = new BufferedReader(new InputStreamReader
-        (getClass().getResourceAsStream("LuceneResourcesWikiPageURLs.txt"), "UTF-8"));
-      String line;
-      while (null != (line = bufferedReader.readLine())) {
-        line = line.trim();
-        if (line.length() > 0) {
-          urlList.add(line);
-        }
-      }
-      urls = urlList.toArray(new String[urlList.size()]);
-    } finally {
-      if (null != bufferedReader) {
-        bufferedReader.close();
-      }
-    }
-    assertTrue(null != urls && urls.length > 0);
-    BaseTokenStreamTestCase.assertAnalyzesTo
-      (urlAnalyzer, luceneResourcesWikiPage, urls);
-  }
-  
-  public void testEmails() throws Exception {
-    Reader reader = null;
-    String randomTextWithEmails;
-    try {
-      reader = new InputStreamReader
-        (getClass().getResourceAsStream("random.text.with.email.addresses.txt"), "UTF-8");
-      StringBuilder builder = new StringBuilder();
-      char[] buffer = new char[1024];
-      int numCharsRead;
-      while (-1 != (numCharsRead = reader.read(buffer))) {
-        builder.append(buffer, 0, numCharsRead);
-      }
-      randomTextWithEmails = builder.toString(); 
-    } finally {
-      if (null != reader) {
-        reader.close();
-      }
-    }
-    assertTrue(null != randomTextWithEmails 
-               && randomTextWithEmails.length() > 0);
-    BufferedReader bufferedReader = null;
-    String[] emails;
-    try {
-      List<String> emailList = new ArrayList<String>();
-      bufferedReader = new BufferedReader(new InputStreamReader
-        (getClass().getResourceAsStream("email.addresses.from.random.text.with.email.addresses.txt"), "UTF-8"));
-      String line;
-      while (null != (line = bufferedReader.readLine())) {
-        line = line.trim();
-        if (line.length() > 0) {
-          emailList.add(line);
-        }
-      }
-      emails = emailList.toArray(new String[emailList.size()]);
-    } finally {
-      if (null != bufferedReader) {
-        bufferedReader.close();
-      }
-    }
-    assertTrue(null != emails && emails.length > 0);
-    BaseTokenStreamTestCase.assertAnalyzesTo
-      (emailAnalyzer, randomTextWithEmails, emails);
-  }
-
-  public void testURLs() throws Exception {
-    Reader reader = null;
-    String randomTextWithURLs;
-    try {
-      reader = new InputStreamReader
-        (getClass().getResourceAsStream("random.text.with.urls.txt"), "UTF-8");
-      StringBuilder builder = new StringBuilder();
-      char[] buffer = new char[1024];
-      int numCharsRead;
-      while (-1 != (numCharsRead = reader.read(buffer))) {
-        builder.append(buffer, 0, numCharsRead);
-      }
-      randomTextWithURLs = builder.toString(); 
-    } finally {
-      if (null != reader) {
-        reader.close();
-      }
-    }
-    assertTrue(null != randomTextWithURLs 
-               && randomTextWithURLs.length() > 0);
-    BufferedReader bufferedReader = null;
-    String[] urls;
-    try {
-      List<String> urlList = new ArrayList<String>();
-      bufferedReader = new BufferedReader(new InputStreamReader
-        (getClass().getResourceAsStream("urls.from.random.text.with.urls.txt"), "UTF-8"));
-      String line;
-      while (null != (line = bufferedReader.readLine())) {
-        line = line.trim();
-        if (line.length() > 0) {
-          urlList.add(line);
-        }
-      }
-      urls = urlList.toArray(new String[urlList.size()]);
-    } finally {
-      if (null != bufferedReader) {
-        bufferedReader.close();
-      }
-    }
-    assertTrue(null != urls && urls.length > 0);
-    BaseTokenStreamTestCase.assertAnalyzesTo
-      (urlAnalyzer, randomTextWithURLs, urls);
-  }
-
  public void testUnicodeWordBreaks() throws Exception {
    WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
    wordBreakTest.test(a);
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java
@ -2,14 +2,21 @@ package org.apache.lucene.analysis.core;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.standard.UAX29Tokenizer;
+import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;

+import java.io.BufferedReader;
 import java.io.IOException;
+import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.List;

 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
@ -28,7 +35,7 @@ import java.util.Arrays;
 * limitations under the License.
 */

-public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
+public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
  
  public void testHugeDoc() throws IOException {
    StringBuilder sb = new StringBuilder();
@ -37,7 +44,7 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
    sb.append(whitespace);
    sb.append("testing 1234");
    String input = sb.toString();
-    UAX29Tokenizer tokenizer = new UAX29Tokenizer(new StringReader(input));
+    UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(new StringReader(input));
    BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
  }

@ -46,11 +53,70 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
    protected TokenStreamComponents createComponents
      (String fieldName, Reader reader) {

-      Tokenizer tokenizer = new UAX29Tokenizer(reader);
+      Tokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
      return new TokenStreamComponents(tokenizer);
    }
  };

+
+  /**
+   * Test-only filter that lets through just the tokens whose type is
+   * {@link UAX29URLEmailTokenizer#URL_TYPE}; every other token read from the
+   * wrapped stream is consumed and dropped.
+   */
+  private class URLFilter extends TokenFilter {
+    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+
+    public URLFilter(TokenStream in) {
+      super(in);
+    }
+
+    /**
+     * Advances the wrapped stream until it is positioned on a URL-typed token.
+     *
+     * @return true when a URL token is available, false once the input is exhausted
+     * @throws java.io.IOException if the wrapped stream fails while advancing
+     */
+    @Override
+    public final boolean incrementToken() throws java.io.IOException {
+      while (input.incrementToken()) {
+        // Compare by value: '==' on Strings only succeeds when both sides are
+        // the very same instance, which is not something this filter controls.
+        if (UAX29URLEmailTokenizer.URL_TYPE.equals(typeAtt.type())) {
+          return true;
+        }
+      }
+      return false;
+    }
+  }
+  
+  /**
+   * Test-only filter that lets through just the tokens whose type is
+   * {@link UAX29URLEmailTokenizer#EMAIL_TYPE}; every other token read from the
+   * wrapped stream is consumed and dropped.
+   */
+  private class EmailFilter extends TokenFilter {
+    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+
+    public EmailFilter(TokenStream in) {
+      super(in);
+    }
+
+    /**
+     * Advances the wrapped stream until it is positioned on an e-mail-typed token.
+     *
+     * @return true when an e-mail token is available, false once the input is exhausted
+     * @throws java.io.IOException if the wrapped stream fails while advancing
+     */
+    @Override
+    public final boolean incrementToken() throws java.io.IOException {
+      while (input.incrementToken()) {
+        // Compare by value: '==' on Strings only succeeds when both sides are
+        // the very same instance, which is not something this filter controls.
+        if (UAX29URLEmailTokenizer.EMAIL_TYPE.equals(typeAtt.type())) {
+          return true;
+        }
+      }
+      return false;
+    }
+  }
+
+  /** Analyzer that chains a UAX29URLEmailTokenizer into a URLFilter. */
+  private Analyzer urlAnalyzer = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      final UAX29URLEmailTokenizer source = new UAX29URLEmailTokenizer(reader);
+      // Lift the token length cap so that URLs of arbitrary length are tokenized.
+      source.setMaxTokenLength(Integer.MAX_VALUE);
+      return new TokenStreamComponents(source, new URLFilter(source));
+    }
+  };
+
+  /** Analyzer that chains a UAX29URLEmailTokenizer into an EmailFilter. */
+  private Analyzer emailAnalyzer = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      final UAX29URLEmailTokenizer source = new UAX29URLEmailTokenizer(reader);
+      return new TokenStreamComponents(source, new EmailFilter(source));
+    }
+  };
+  
+  
  public void testArmenian() throws Exception {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
        new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից", 
@ -163,7 +229,6 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
    BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
    BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
  }

  public void testTextWithNumbersSA() throws Exception {
@ -197,6 +262,140 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
        new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
  }
  
+  /**
+   * Runs urlAnalyzer over the contents of LuceneResourcesWikiPage.html and
+   * asserts, via assertAnalyzesTo, that its output matches the non-blank,
+   * trimmed lines of LuceneResourcesWikiPageURLs.txt.
+   */
+  public void testWikiURLs() throws Exception {
+    final String pageResource = "LuceneResourcesWikiPage.html";
+    final String urlsResource = "LuceneResourcesWikiPageURLs.txt";
+
+    // getResourceAsStream() returns null for a missing resource; fail with the
+    // resource name rather than an anonymous NullPointerException.
+    java.io.InputStream pageStream = getClass().getResourceAsStream(pageResource);
+    assertTrue("Missing test resource: " + pageResource, null != pageStream);
+    String luceneResourcesWikiPage;
+    Reader reader = new InputStreamReader(pageStream, "UTF-8");
+    try {
+      StringBuilder builder = new StringBuilder();
+      char[] buffer = new char[1024];
+      int numCharsRead;
+      while (-1 != (numCharsRead = reader.read(buffer))) {
+        builder.append(buffer, 0, numCharsRead);
+      }
+      luceneResourcesWikiPage = builder.toString();
+    } finally {
+      reader.close();
+    }
+    assertTrue("Empty test resource: " + pageResource,
+               luceneResourcesWikiPage.length() > 0);
+
+    java.io.InputStream urlsStream = getClass().getResourceAsStream(urlsResource);
+    assertTrue("Missing test resource: " + urlsResource, null != urlsStream);
+    List<String> urlList = new ArrayList<String>();
+    BufferedReader bufferedReader
+      = new BufferedReader(new InputStreamReader(urlsStream, "UTF-8"));
+    try {
+      String line;
+      while (null != (line = bufferedReader.readLine())) {
+        line = line.trim();
+        if (line.length() > 0) {   // skip blank lines
+          urlList.add(line);
+        }
+      }
+    } finally {
+      bufferedReader.close();
+    }
+    assertTrue("No expected URLs listed in " + urlsResource, urlList.size() > 0);
+    String[] urls = urlList.toArray(new String[urlList.size()]);
+    BaseTokenStreamTestCase.assertAnalyzesTo
+      (urlAnalyzer, luceneResourcesWikiPage, urls);
+  }
+  
+  /**
+   * Runs emailAnalyzer over the contents of random.text.with.email.addresses.txt
+   * and asserts, via assertAnalyzesTo, that its output matches the non-blank,
+   * trimmed lines of email.addresses.from.random.text.with.email.addresses.txt.
+   */
+  public void testEmails() throws Exception {
+    final String textResource = "random.text.with.email.addresses.txt";
+    final String emailsResource
+      = "email.addresses.from.random.text.with.email.addresses.txt";
+
+    // getResourceAsStream() returns null for a missing resource; fail with the
+    // resource name rather than an anonymous NullPointerException.
+    java.io.InputStream textStream = getClass().getResourceAsStream(textResource);
+    assertTrue("Missing test resource: " + textResource, null != textStream);
+    String randomTextWithEmails;
+    Reader reader = new InputStreamReader(textStream, "UTF-8");
+    try {
+      StringBuilder builder = new StringBuilder();
+      char[] buffer = new char[1024];
+      int numCharsRead;
+      while (-1 != (numCharsRead = reader.read(buffer))) {
+        builder.append(buffer, 0, numCharsRead);
+      }
+      randomTextWithEmails = builder.toString();
+    } finally {
+      reader.close();
+    }
+    assertTrue("Empty test resource: " + textResource,
+               randomTextWithEmails.length() > 0);
+
+    java.io.InputStream emailsStream = getClass().getResourceAsStream(emailsResource);
+    assertTrue("Missing test resource: " + emailsResource, null != emailsStream);
+    List<String> emailList = new ArrayList<String>();
+    BufferedReader bufferedReader
+      = new BufferedReader(new InputStreamReader(emailsStream, "UTF-8"));
+    try {
+      String line;
+      while (null != (line = bufferedReader.readLine())) {
+        line = line.trim();
+        if (line.length() > 0) {   // skip blank lines
+          emailList.add(line);
+        }
+      }
+    } finally {
+      bufferedReader.close();
+    }
+    assertTrue("No expected e-mail addresses listed in " + emailsResource,
+               emailList.size() > 0);
+    String[] emails = emailList.toArray(new String[emailList.size()]);
+    BaseTokenStreamTestCase.assertAnalyzesTo
+      (emailAnalyzer, randomTextWithEmails, emails);
+  }
+
+  /**
+   * Runs urlAnalyzer over the contents of random.text.with.urls.txt and
+   * asserts, via assertAnalyzesTo, that its output matches the non-blank,
+   * trimmed lines of urls.from.random.text.with.urls.txt.
+   */
+  public void testURLs() throws Exception {
+    final String textResource = "random.text.with.urls.txt";
+    final String urlsResource = "urls.from.random.text.with.urls.txt";
+
+    // getResourceAsStream() returns null for a missing resource; fail with the
+    // resource name rather than an anonymous NullPointerException.
+    java.io.InputStream textStream = getClass().getResourceAsStream(textResource);
+    assertTrue("Missing test resource: " + textResource, null != textStream);
+    String randomTextWithURLs;
+    Reader reader = new InputStreamReader(textStream, "UTF-8");
+    try {
+      StringBuilder builder = new StringBuilder();
+      char[] buffer = new char[1024];
+      int numCharsRead;
+      while (-1 != (numCharsRead = reader.read(buffer))) {
+        builder.append(buffer, 0, numCharsRead);
+      }
+      randomTextWithURLs = builder.toString();
+    } finally {
+      reader.close();
+    }
+    assertTrue("Empty test resource: " + textResource,
+               randomTextWithURLs.length() > 0);
+
+    java.io.InputStream urlsStream = getClass().getResourceAsStream(urlsResource);
+    assertTrue("Missing test resource: " + urlsResource, null != urlsStream);
+    List<String> urlList = new ArrayList<String>();
+    BufferedReader bufferedReader
+      = new BufferedReader(new InputStreamReader(urlsStream, "UTF-8"));
+    try {
+      String line;
+      while (null != (line = bufferedReader.readLine())) {
+        line = line.trim();
+        if (line.length() > 0) {   // skip blank lines
+          urlList.add(line);
+        }
+      }
+    } finally {
+      bufferedReader.close();
+    }
+    assertTrue("No expected URLs listed in " + urlsResource, urlList.size() > 0);
+    String[] urls = urlList.toArray(new String[urlList.size()]);
+    BaseTokenStreamTestCase.assertAnalyzesTo
+      (urlAnalyzer, randomTextWithURLs, urls);
+  }
+
  public void testUnicodeWordBreaks() throws Exception {
    WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
    wordBreakTest.test(a);
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
@ -123,7 +123,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
      assertAnalyzesToReuse(
          analyzer,
          "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
-          new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz@demo.com" });
+          new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });
 	}
 	
 	/** @deprecated (3.1) for version back compat */
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -302,8 +302,10 @@ New Features
 * SOLR-1336: Add SmartChinese (word segmentation for Simplified Chinese) 
  tokenizer and filters to contrib/analysis-extras (rmuir)

-* SOLR-2211: Added UAX29TokenizerFactory, which implements UAX#29, a unicode algorithm 
-  with good results for most languages.  (Tom Burton-West via rmuir)
+* SOLR-2211,LUCENE-2763: Added UAX29URLEmailTokenizerFactory, which implements
+  UAX#29, a Unicode algorithm with good results for most languages, as well as
+  URL and E-mail tokenization according to the relevant RFCs.
+  (Tom Burton-West via rmuir)

 * SOLR-2237: Added StempelPolishStemFilterFactory to contrib/analysis-extras (rmuir)

--- a/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java
+++ b/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java
@ -20,7 +20,7 @@ package org.apache.solr.analysis;



-import org.apache.lucene.analysis.standard.UAX29Tokenizer;
+import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;

 import java.io.Reader;
 import java.util.Map;
@ -30,14 +30,14 @@ import java.util.Map;
 * 
 */

-public class UAX29TokenizerFactory extends BaseTokenizerFactory {
+public class UAX29URLEmailTokenizerFactory extends BaseTokenizerFactory {
  @Override
  public void init(Map<String,String> args) {
    super.init(args);
    assureMatchVersion();
  }

-  public UAX29Tokenizer create(Reader input) {
-    return new UAX29Tokenizer(input);
+  public UAX29URLEmailTokenizer create(Reader input) {
+    return new UAX29URLEmailTokenizer(input);
  }
 }
--- a/solr/src/test/org/apache/solr/analysis/TestUAX29TokenizerFactory.java
+++ b/solr/src/test/org/apache/solr/analysis/TestUAX29TokenizerFactory.java
@ -1,81 +0,0 @@
-package org.apache.solr.analysis;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.Reader;
-import java.io.StringReader;
-import org.apache.lucene.analysis.Tokenizer;
-
-/**
- * A few tests based on  org.apache.lucene.analysis.TestUAX29Tokenizer;
- */
-
-public class TestUAX29TokenizerFactory extends BaseTokenTestCase {
-  /**
-   * Test UAX29TokenizerFactory
-   */
-  public void testUAX29Tokenizer() throws Exception {
-    Reader reader = new StringReader("Wha\u0301t's this thing do?");
-    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
-    factory.init(DEFAULT_VERSION_PARAM);
-    Tokenizer stream = factory.create(reader);
-    assertTokenStreamContents(stream, 
-        new String[] {"Wha\u0301t's", "this", "thing", "do" });
-  }
-  
-  public void testArabic() throws Exception {
-    Reader reader = new StringReader("الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.");
-    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
-    factory.init(DEFAULT_VERSION_PARAM);
-    Tokenizer stream = factory.create(reader);
-    assertTokenStreamContents(stream, 
-        new String[] {"الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
-        "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008"  });
-  }
-  
-  public void testChinese() throws Exception {
-    Reader reader = new StringReader("我是中国人。 １２３４ Ｔｅｓｔｓ ");
-    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
-    factory.init(DEFAULT_VERSION_PARAM);
-    Tokenizer stream = factory.create(reader);
-    assertTokenStreamContents(stream, 
-        new String[] {"我", "是", "中", "国", "人", "１２３４", "Ｔｅｓｔｓ"});
-  }
-  public void testKorean() throws Exception {
-    Reader reader = new StringReader("안녕하세요 한글입니다");
-    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
-    factory.init(DEFAULT_VERSION_PARAM);
-    Tokenizer stream = factory.create(reader);
-    assertTokenStreamContents(stream, 
-        new String[] {"안녕하세요", "한글입니다"});
-  }
-    
-  public void testHyphen() throws Exception {
-    Reader reader = new StringReader("some-dashed-phrase");
-    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
-    factory.init(DEFAULT_VERSION_PARAM);
-    Tokenizer stream = factory.create(reader);
-    assertTokenStreamContents(stream, 
-        new String[] {"some", "dashed", "phrase"});
-  }
-
-}
-    
-  
-  
-  
--- a/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java
+++ b/solr/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java
@ -0,0 +1,155 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+import org.apache.lucene.analysis.Tokenizer;
+
+/**
+ * Tokenization checks for UAX29URLEmailTokenizerFactory, modeled on
+ * org.apache.lucene.analysis.core.TestUAX29URLEmailTokenizer.
+ */
+
+public class TestUAX29URLEmailTokenizerFactory extends BaseTokenTestCase {
+
+  /**
+   * Creates a tokenizer over the given text using a new factory initialized
+   * with DEFAULT_VERSION_PARAM.
+   */
+  private Tokenizer tokenize(String text) {
+    Reader source = new StringReader(text);
+    UAX29URLEmailTokenizerFactory tokenizerFactory = new UAX29URLEmailTokenizerFactory();
+    tokenizerFactory.init(DEFAULT_VERSION_PARAM);
+    return tokenizerFactory.create(source);
+  }
+
+  /** English text containing a combining accent and an apostrophe. */
+  public void testUAX29URLEmailTokenizer() throws Exception {
+    String[] expected = {"Wha\u0301t's", "this", "thing", "do" };
+    assertTokenStreamContents(tokenize("Wha\u0301t's this thing do?"), expected);
+  }
+
+  /** Arabic text with embedded English words, punctuation and a number. */
+  public void testArabic() throws Exception {
+    Tokenizer tokens = tokenize("الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.");
+    assertTokenStreamContents(tokens,
+        new String[] {"الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
+        "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008"  });
+  }
+
+  /** Chinese text followed by fullwidth digits and fullwidth Latin letters. */
+  public void testChinese() throws Exception {
+    String[] expected = {"我", "是", "中", "国", "人", "１２３４", "Ｔｅｓｔｓ"};
+    assertTokenStreamContents(tokenize("我是中国人。 １２３４ Ｔｅｓｔｓ "), expected);
+  }
+
+  /** Two space-separated Korean words. */
+  public void testKorean() throws Exception {
+    String[] expected = {"안녕하세요", "한글입니다"};
+    assertTokenStreamContents(tokenize("안녕하세요 한글입니다"), expected);
+  }
+
+  /** A hyphenated phrase is expected to split at the hyphens. */
+  public void testHyphen() throws Exception {
+    String[] expected = {"some", "dashed", "phrase"};
+    assertTokenStreamContents(tokenize("some-dashed-phrase"), expected);
+  }
+
+  // Test with some URLs from TestUAX29URLEmailTokenizer's 
+  // urls.from.random.text.with.urls.txt
+  public void testURLs() throws Exception {
+    String textWithURLs 
+      = "http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram&paragraphs=50&length=200&no-ads=on\n"
+        + " some extra\nWords thrown in here. "
+        + "http://c5-3486.bisynxu.FR/aI.YnNms/"
+        + " samba Halta gamba "
+        + "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R\n"
+        + "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb\n"
+        + "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m"
+        + " inter Locutio "
+        + "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/\n"
+        + "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7"
+        + " blah Sirrah woof "
+        + "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4\n";
+    String[] expected = { 
+      "http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram&paragraphs=50&length=200&no-ads=on",
+      "some", "extra", "Words", "thrown", "in", "here",
+      "http://c5-3486.bisynxu.FR/aI.YnNms/",
+      "samba", "Halta", "gamba",
+      "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R",
+      "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb",
+      "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m",
+      "inter", "Locutio",
+      "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/",
+      "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7",
+      "blah", "Sirrah", "woof",
+      "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4"
+    };
+    assertTokenStreamContents(tokenize(textWithURLs), expected);
+  }
+
+  // Test with some emails from TestUAX29URLEmailTokenizer's 
+  // email.addresses.from.random.text.with.email.addresses.txt
+  public void testEmails() throws Exception {
+    String textWithEmails 
+      =  " some extra\nWords thrown in here. "
+         + "dJ8ngFi@avz13m.CC\n"
+         + "kU-l6DS@[082.015.228.189]\n"
+         + "\"%U\u0012@?\\B\"@Fl2d.md"
+         + " samba Halta gamba "
+         + "Bvd#@tupjv.sn\n"
+         + "SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt\n"
+         + "~+Kdz@3mousnl.SE\n"
+         + " inter Locutio "
+         + "C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY\n"
+         + "}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM"
+         + " blah Sirrah woof "
+         + "lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae\n"
+         + "lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H\n";
+    String[] expected = { 
+      "some", "extra", "Words", "thrown", "in", "here",
+      "dJ8ngFi@avz13m.CC",
+      "kU-l6DS@[082.015.228.189]",
+      "\"%U\u0012@?\\B\"@Fl2d.md",
+      "samba", "Halta", "gamba",
+      "Bvd#@tupjv.sn",
+      "SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt",
+      "~+Kdz@3mousnl.SE",
+      "inter", "Locutio",
+      "C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY",
+      "}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM",
+      "blah", "Sirrah", "woof",
+      "lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae",
+      "lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H"
+    };
+    assertTokenStreamContents(tokenize(textWithEmails), expected);
+  }
+}