LUCENE-5357: Upgrade StandardTokenizer and UAX29URLEmailTokenizer to Unicode 6.3; update UAX29URLEmailTokenizer's recognized top level domains in URLs and Emails from the IANA Root Zone Database.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1548595 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2013-12-06 16:51:43 +00:00
parent 618f6b876d
commit d516948bbd
27 changed files with 8776 additions and 5333 deletions

View File

@ -91,6 +91,11 @@ Build
* LUCENE-4381: Upgrade analysis/icu to 52.1. (Robert Muir)
* LUCENE-5357: Upgrade StandardTokenizer and UAX29URLEmailTokenizer to
Unicode 6.3; update UAX29URLEmailTokenizer's recognized top level
domains in URLs and Emails from the IANA Root Zone Database.
(Steve Rowe)
Bug fixes
* LUCENE-5285: Improved highlighting of multi-valued fields with

View File

@ -45,17 +45,13 @@
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>
</taskdef>
<!-- this logic below looks duplicated with run-jflex, but its not, the regexp is different! -->
<jflex file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex"
outdir="src/java/org/apache/lucene/analysis/charfilter"
nobak="on"/>
<!-- Remove the inappropriate JFlex-generated constructors -->
nobak="on" inputstreamctor="false"/>
<!-- Remove the inappropriate JFlex-generated constructor -->
<replaceregexp file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java"
match="/\*\*\s*\*\s*Creates a new scanner.*this\(new java\.io\.InputStreamReader\(in\)\);\s*\}"
replace="" flags="sg"/>
<replaceregexp file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java"
match="\/\*\s*The following code was generated by JFlex.*"
replace="\/\* The following code was generated by JFlex. \*\/" flags=""/>
match="/\*\*\s*\*\s*Creates a new scanner\s*\*\s*\*\s*@param\s*in\s*the java.io.Reader to read input from\.\s*\*/\s*public HTMLStripCharFilter\(java\.io\.Reader in\)\s*\{\s*this.zzReader = in;\s*\}"
replace="" flags="s"/>
</target>
<target name="generate-jflex-html-char-entities">
@ -96,15 +92,7 @@
<attribute name="dir"/>
<attribute name="name"/>
<sequential>
<jflex file="@{dir}/@{name}.jflex"
outdir="@{dir}"
nobak="on" />
<replaceregexp file="@{dir}/@{name}.java"
match="/\*\*\s*\*\s*Creates a new scanner\..*this\(new java\.io\.InputStreamReader\(in\)\);\s*\}"
replace="" flags="sg"/>
<replaceregexp file="@{dir}/@{name}.java"
match="\/\*\s*The following code was generated by JFlex.*"
replace="\/\* The following code was generated by JFlex. \*\/" flags=""/>
<jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on" inputstreamctor="false"/>
</sequential>
</macrodef>

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex. */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT */
package org.apache.lucene.analysis.charfilter;
@ -152,77 +152,77 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
"\21\1\1\41\32\1\5\0\113\1\3\0\3\1\17\0\15\1\1\0"+
"\4\1\3\2\13\0\22\1\3\2\13\0\22\1\2\2\14\0\15\1"+
"\1\0\3\1\1\0\2\2\14\0\64\1\40\2\3\0\1\1\4\0"+
"\1\1\1\2\2\0\12\274\41\0\3\2\1\41\1\0\12\274\6\0"+
"\130\1\10\0\51\1\1\2\1\1\5\0\106\1\12\0\35\1\3\0"+
"\14\2\4\0\14\2\12\0\12\274\36\1\2\0\5\1\13\0\54\1"+
"\4\0\21\2\7\1\2\2\6\0\12\274\1\2\45\0\27\1\5\2"+
"\4\0\65\1\12\2\1\0\35\2\2\0\1\2\12\274\6\0\12\274"+
"\15\0\1\1\130\0\5\2\57\1\21\2\7\1\4\0\12\274\21\0"+
"\11\2\14\0\3\2\36\1\15\2\2\1\12\274\54\1\16\2\14\0"+
"\44\1\24\2\10\0\12\274\3\0\3\1\12\274\44\1\122\0\3\2"+
"\1\0\25\2\4\1\1\2\4\1\3\2\2\1\11\0\300\1\47\2"+
"\25\0\4\2\u0116\1\2\0\6\1\2\0\46\1\2\0\6\1\2\0"+
"\10\1\1\0\1\1\1\0\1\1\1\0\1\1\1\0\37\1\2\0"+
"\65\1\1\0\7\1\1\0\1\1\3\0\3\1\1\0\7\1\3\0"+
"\4\1\2\0\6\1\4\0\15\1\5\0\3\1\1\0\7\1\3\0"+
"\13\41\35\0\2\41\5\0\1\41\17\0\2\2\23\0\1\2\12\0"+
"\1\41\21\0\1\1\15\0\1\1\20\0\15\1\63\0\15\2\4\0"+
"\1\2\3\0\14\2\21\0\1\1\4\0\1\1\2\0\12\1\1\0"+
"\1\1\2\0\6\1\6\0\1\1\1\0\1\1\1\0\1\1\1\0"+
"\20\1\2\0\4\1\5\0\5\1\4\0\1\1\21\0\51\1\u0a77\0"+
"\57\1\1\0\57\1\1\0\205\1\6\0\4\1\3\2\2\1\14\0"+
"\46\1\1\0\1\1\5\0\1\1\2\0\70\1\7\0\1\1\17\0"+
"\1\2\27\1\11\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1"+
"\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\40\2"+
"\u0200\0\1\41\4\0\3\1\31\0\11\1\6\2\1\0\5\1\2\0"+
"\5\1\4\0\126\1\2\0\2\2\5\1\1\0\132\1\1\0\4\1"+
"\5\0\51\1\3\0\136\1\21\0\33\1\65\0\20\1\u0200\0\u19b6\1"+
"\112\0\u51cd\1\63\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1"+
"\12\274\2\1\24\0\57\1\1\2\4\0\12\2\1\0\31\1\7\0"+
"\1\2\120\1\2\2\45\0\11\1\2\0\147\1\2\0\4\1\1\0"+
"\4\1\14\0\13\1\115\0\12\1\1\2\3\1\1\2\4\1\1\2"+
"\27\1\5\2\30\0\64\1\14\0\2\2\62\1\21\2\13\0\12\274"+
"\6\0\22\2\6\1\3\0\1\1\4\0\12\274\34\1\10\2\2\0"+
"\27\1\15\2\14\0\35\1\3\0\4\2\57\1\16\2\16\0\1\1"+
"\12\274\46\0\51\1\16\2\11\0\3\1\1\2\10\1\2\2\2\0"+
"\12\274\6\0\27\1\3\0\1\1\1\2\4\0\60\1\1\2\1\1"+
"\3\2\2\1\2\2\5\1\2\2\1\1\1\2\1\1\30\0\3\1"+
"\2\0\13\1\5\2\2\0\3\1\2\2\12\0\6\1\2\0\6\1"+
"\2\0\6\1\11\0\7\1\1\0\7\1\221\0\43\1\10\2\1\0"+
"\2\2\2\0\12\274\6\0\u2ba4\1\14\0\27\1\4\0\61\1\4\0"+
"\1\170\1\223\1\103\1\165\1\136\1\214\2\0\1\160\1\153\2\0"+
"\1\120\1\210\14\0\1\105\1\127\20\0\1\122\7\0\1\256\1\112"+
"\5\0\1\143\4\0\51\120\1\110\3\120\1\124\1\220\17\0\1\133"+
"\u02c1\0\1\252\277\0\2\123\1\212\3\222\2\211\1\222\1\211\2\222"+
"\1\221\21\222\11\213\1\157\7\213\7\204\1\156\1\204\1\246\2\207"+
"\1\166\1\246\1\207\1\166\10\246\2\167\5\203\2\155\5\203\1\107"+
"\10\202\5\154\3\224\12\251\20\224\3\225\32\227\1\226\2\200\2\234"+
"\1\235\2\234\2\235\2\234\1\235\3\200\1\177\2\200\12\250\1\247"+
"\1\176\1\171\7\176\1\171\13\176\31\200\7\176\12\250\1\176\5\134"+
"\3\245\3\142\1\140\4\142\2\140\10\142\1\140\7\141\1\137\2\141"+
"\7\142\16\245\1\135\4\245\1\106\4\244\1\106\5\255\1\254\1\255"+
"\3\254\7\255\1\254\23\255\5\264\3\255\6\264\2\255\6\253\5\263"+
"\3\262\2\142\7\257\36\142\4\257\5\142\5\245\6\244\2\245\1\244"+
"\4\141\13\253\12\244\26\253\15\134\1\243\2\134\1\152\3\237\1\134"+
"\2\237\5\151\4\237\4\152\1\151\3\152\1\151\5\152\2\147\1\116"+
"\2\147\1\116\1\147\2\116\1\147\1\116\12\147\1\116\4\146\1\115"+
"\1\236\1\240\1\150\3\164\1\240\2\164\1\260\2\261\2\164\1\150"+
"\1\164\1\150\1\164\1\150\1\164\3\150\1\164\2\150\1\164\1\150"+
"\2\164\1\150\1\164\1\150\1\164\1\150\1\164\1\150\1\164\1\150"+
"\1\162\2\145\1\162\1\145\2\162\4\145\1\162\7\145\1\162\4\145"+
"\1\162\4\145\1\164\1\150\1\164\12\216\1\217\21\216\1\217\3\215"+
"\1\217\3\216\1\217\1\216\2\144\2\216\1\217\15\241\4\201\4\206"+
"\1\242\1\161\10\242\7\206\6\164\4\113\1\121\37\113\1\121\4\113"+
"\25\174\1\131\11\174\21\130\5\174\1\104\12\117\5\174\6\205\4\162"+
"\1\163\1\130\5\231\12\232\17\231\1\125\3\114\14\230\1\126\11\173"+
"\1\172\5\173\4\233\13\175\2\132\11\173\1\172\31\173\1\172\4\126"+
"\4\173\2\172\2\265\1\111\5\265\52\111\u1900\0\u016e\1\2\0\152\1"+
"\46\0\7\1\14\0\5\1\5\0\1\1\1\2\12\1\1\0\15\1"+
"\1\0\5\1\1\0\1\1\1\0\2\1\1\0\2\1\1\0\154\1"+
"\41\0\u016b\1\22\0\100\1\2\0\66\1\50\0\14\1\4\0\20\2"+
"\20\0\7\2\14\0\2\2\30\0\3\2\40\0\5\1\1\0\207\1"+
"\23\0\12\274\7\0\32\1\4\0\1\2\1\0\32\1\13\0\131\1"+
"\3\0\6\1\2\0\6\1\2\0\6\1\2\0\3\1\43\0";
"\1\1\1\2\2\0\12\274\41\0\3\2\2\0\12\274\6\0\130\1"+
"\10\0\51\1\1\2\1\1\5\0\106\1\12\0\35\1\3\0\14\2"+
"\4\0\14\2\12\0\12\274\36\1\2\0\5\1\13\0\54\1\4\0"+
"\21\2\7\1\2\2\6\0\12\274\1\2\45\0\27\1\5\2\4\0"+
"\65\1\12\2\1\0\35\2\2\0\1\2\12\274\6\0\12\274\15\0"+
"\1\1\130\0\5\2\57\1\21\2\7\1\4\0\12\274\21\0\11\2"+
"\14\0\3\2\36\1\15\2\2\1\12\274\54\1\16\2\14\0\44\1"+
"\24\2\10\0\12\274\3\0\3\1\12\274\44\1\122\0\3\2\1\0"+
"\25\2\4\1\1\2\4\1\3\2\2\1\11\0\300\1\47\2\25\0"+
"\4\2\u0116\1\2\0\6\1\2\0\46\1\2\0\6\1\2\0\10\1"+
"\1\0\1\1\1\0\1\1\1\0\1\1\1\0\37\1\2\0\65\1"+
"\1\0\7\1\1\0\1\1\3\0\3\1\1\0\7\1\3\0\4\1"+
"\2\0\6\1\4\0\15\1\5\0\3\1\1\0\7\1\3\0\13\41"+
"\35\0\2\41\5\0\1\41\17\0\2\2\23\0\1\2\12\0\1\41"+
"\21\0\1\1\15\0\1\1\20\0\15\1\63\0\15\2\4\0\1\2"+
"\3\0\14\2\21\0\1\1\4\0\1\1\2\0\12\1\1\0\1\1"+
"\2\0\6\1\6\0\1\1\1\0\1\1\1\0\1\1\1\0\20\1"+
"\2\0\4\1\5\0\5\1\4\0\1\1\21\0\51\1\u0a77\0\57\1"+
"\1\0\57\1\1\0\205\1\6\0\4\1\3\2\2\1\14\0\46\1"+
"\1\0\1\1\5\0\1\1\2\0\70\1\7\0\1\1\17\0\1\2"+
"\27\1\11\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0"+
"\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\40\2\u0200\0"+
"\1\41\4\0\3\1\31\0\11\1\6\2\1\0\5\1\2\0\5\1"+
"\4\0\126\1\2\0\2\2\5\1\1\0\132\1\1\0\4\1\5\0"+
"\51\1\3\0\136\1\21\0\33\1\65\0\20\1\u0200\0\u19b6\1\112\0"+
"\u51cd\1\63\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\274"+
"\2\1\24\0\57\1\1\2\4\0\12\2\1\0\31\1\7\0\1\2"+
"\120\1\2\2\45\0\11\1\2\0\147\1\2\0\4\1\1\0\4\1"+
"\14\0\13\1\115\0\12\1\1\2\3\1\1\2\4\1\1\2\27\1"+
"\5\2\30\0\64\1\14\0\2\2\62\1\21\2\13\0\12\274\6\0"+
"\22\2\6\1\3\0\1\1\4\0\12\274\34\1\10\2\2\0\27\1"+
"\15\2\14\0\35\1\3\0\4\2\57\1\16\2\16\0\1\1\12\274"+
"\46\0\51\1\16\2\11\0\3\1\1\2\10\1\2\2\2\0\12\274"+
"\6\0\27\1\3\0\1\1\1\2\4\0\60\1\1\2\1\1\3\2"+
"\2\1\2\2\5\1\2\2\1\1\1\2\1\1\30\0\3\1\2\0"+
"\13\1\5\2\2\0\3\1\2\2\12\0\6\1\2\0\6\1\2\0"+
"\6\1\11\0\7\1\1\0\7\1\221\0\43\1\10\2\1\0\2\2"+
"\2\0\12\274\6\0\u2ba4\1\14\0\27\1\4\0\61\1\4\0\1\170"+
"\1\223\1\103\1\165\1\136\1\214\2\0\1\160\1\153\2\0\1\120"+
"\1\210\14\0\1\105\1\127\20\0\1\122\7\0\1\256\1\112\5\0"+
"\1\143\4\0\51\120\1\110\3\120\1\124\1\220\17\0\1\133\u02c1\0"+
"\1\252\277\0\2\123\1\212\3\222\2\211\1\222\1\211\2\222\1\221"+
"\21\222\11\213\1\157\7\213\7\204\1\156\1\204\1\246\2\207\1\166"+
"\1\246\1\207\1\166\10\246\2\167\5\203\2\155\5\203\1\107\10\202"+
"\5\154\3\224\12\251\20\224\3\225\32\227\1\226\2\200\2\234\1\235"+
"\2\234\2\235\2\234\1\235\3\200\1\177\2\200\12\250\1\247\1\176"+
"\1\171\7\176\1\171\13\176\31\200\7\176\12\250\1\176\5\134\3\245"+
"\3\142\1\140\4\142\2\140\10\142\1\140\7\141\1\137\2\141\7\142"+
"\16\245\1\135\4\245\1\106\4\244\1\106\5\255\1\254\1\255\3\254"+
"\7\255\1\254\23\255\5\264\3\255\6\264\2\255\6\253\5\263\3\262"+
"\2\142\7\257\36\142\4\257\5\142\5\245\6\244\2\245\1\244\4\141"+
"\13\253\12\244\26\253\15\134\1\243\2\134\1\152\3\237\1\134\2\237"+
"\5\151\4\237\4\152\1\151\3\152\1\151\5\152\2\147\1\116\2\147"+
"\1\116\1\147\2\116\1\147\1\116\12\147\1\116\4\146\1\115\1\236"+
"\1\240\1\150\3\164\1\240\2\164\1\260\2\261\2\164\1\150\1\164"+
"\1\150\1\164\1\150\1\164\3\150\1\164\2\150\1\164\1\150\2\164"+
"\1\150\1\164\1\150\1\164\1\150\1\164\1\150\1\164\1\150\1\162"+
"\2\145\1\162\1\145\2\162\4\145\1\162\7\145\1\162\4\145\1\162"+
"\4\145\1\164\1\150\1\164\12\216\1\217\21\216\1\217\3\215\1\217"+
"\3\216\1\217\1\216\2\144\2\216\1\217\15\241\4\201\4\206\1\242"+
"\1\161\10\242\7\206\6\164\4\113\1\121\37\113\1\121\4\113\25\174"+
"\1\131\11\174\21\130\5\174\1\104\12\117\5\174\6\205\4\162\1\163"+
"\1\130\5\231\12\232\17\231\1\125\3\114\14\230\1\126\11\173\1\172"+
"\5\173\4\233\13\175\2\132\11\173\1\172\31\173\1\172\4\126\4\173"+
"\2\172\2\265\1\111\5\265\52\111\u1900\0\u016e\1\2\0\152\1\46\0"+
"\7\1\14\0\5\1\5\0\1\1\1\2\12\1\1\0\15\1\1\0"+
"\5\1\1\0\1\1\1\0\2\1\1\0\2\1\1\0\154\1\41\0"+
"\u016b\1\22\0\100\1\2\0\66\1\50\0\14\1\4\0\20\2\20\0"+
"\7\2\14\0\2\2\30\0\3\2\40\0\5\1\1\0\207\1\23\0"+
"\12\274\7\0\32\1\4\0\1\2\1\0\32\1\13\0\131\1\3\0"+
"\6\1\2\0\6\1\2\0\6\1\2\0\3\1\43\0";
/**
* Translates characters to character classes
@ -30895,6 +30895,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
/**
* Unpacks the compressed character translation table.
*
@ -30905,7 +30906,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
char [] map = new char[0x10000];
int i = 0; /* index in packed string */
int j = 0; /* index in unpacked array */
while (i < 2778) {
while (i < 2776) {
int count = packed.charAt(i++);
char value = packed.charAt(i++);
do map[j++] = value; while (--count > 0);

View File

@ -34,7 +34,7 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
*/
%%
%unicode 6.1
%unicode 6.3
%apiprivate
%type int
%final

View File

@ -1,11 +1,12 @@
/*
* Copyright 2001-2005 The Apache Software Foundation.
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@ -13,10 +14,9 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
// file version from Saturday, July 14, 2012 4:34:14 AM UTC
// generated on Sunday, July 15, 2012 12:59:44 AM UTC
// file version from Friday, December 6, 2013 4:34:10 AM UTC
// generated on Friday, December 6, 2013 3:21:59 PM UTC
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
ASCIITLD = "." (
@ -49,6 +49,7 @@ ASCIITLD = "." (
| [bB][gG]
| [bB][hH]
| [bB][iI]
| [bB][iI][kK][eE]
| [bB][iI][zZ]
| [bB][jJ]
| [bB][mM]
@ -62,6 +63,7 @@ ASCIITLD = "." (
| [bB][yY]
| [bB][zZ]
| [cC][aA]
| [cC][aA][mM][eE][rR][aA]
| [cC][aA][tT]
| [cC][cC]
| [cC][dD]
@ -71,10 +73,13 @@ ASCIITLD = "." (
| [cC][iI]
| [cC][kK]
| [cC][lL]
| [cC][lL][oO][tT][hH][iI][nN][gG]
| [cC][mM]
| [cC][nN]
| [cC][oO]
| [cC][oO][mM]
| [cC][oO][nN][sS][tT][rR][uU][cC][tT][iI][oO][nN]
| [cC][oO][nN][tT][rR][aA][cC][tT][oO][rR][sS]
| [cC][oO][oO][pP]
| [cC][rR]
| [cC][uU]
@ -84,6 +89,8 @@ ASCIITLD = "." (
| [cC][yY]
| [cC][zZ]
| [dD][eE]
| [dD][iI][aA][mM][oO][nN][dD][sS]
| [dD][iI][rR][eE][cC][tT][oO][rR][yY]
| [dD][jJ]
| [dD][kK]
| [dD][mM]
@ -93,8 +100,11 @@ ASCIITLD = "." (
| [eE][dD][uU]
| [eE][eE]
| [eE][gG]
| [eE][nN][tT][eE][rR][pP][rR][iI][sS][eE][sS]
| [eE][qQ][uU][iI][pP][mM][eE][nN][tT]
| [eE][rR]
| [eE][sS]
| [eE][sS][tT][aA][tT][eE]
| [eE][tT]
| [eE][uU]
| [fF][iI]
@ -104,6 +114,7 @@ ASCIITLD = "." (
| [fF][oO]
| [fF][rR]
| [gG][aA]
| [gG][aA][lL][lL][eE][rR][yY]
| [gG][bB]
| [gG][dD]
| [gG][eE]
@ -118,14 +129,17 @@ ASCIITLD = "." (
| [gG][pP]
| [gG][qQ]
| [gG][rR]
| [gG][rR][aA][pP][hH][iI][cC][sS]
| [gG][sS]
| [gG][tT]
| [gG][uU]
| [gG][uU][rR][uU]
| [gG][wW]
| [gG][yY]
| [hH][kK]
| [hH][mM]
| [hH][nN]
| [hH][oO][lL][dD][iI][nN][gG][sS]
| [hH][rR]
| [hH][tT]
| [hH][uU]
@ -150,6 +164,7 @@ ASCIITLD = "." (
| [kK][gG]
| [kK][hH]
| [kK][iI]
| [kK][iI][tT][cC][hH][eE][nN]
| [kK][mM]
| [kK][nN]
| [kK][pP]
@ -158,9 +173,11 @@ ASCIITLD = "." (
| [kK][yY]
| [kK][zZ]
| [lL][aA]
| [lL][aA][nN][dD]
| [lL][bB]
| [lL][cC]
| [lL][iI]
| [lL][iI][gG][hH][tT][iI][nN][gG]
| [lL][kK]
| [lL][rR]
| [lL][sS]
@ -172,6 +189,7 @@ ASCIITLD = "." (
| [mM][cC]
| [mM][dD]
| [mM][eE]
| [mM][eE][nN][uU]
| [mM][gG]
| [mM][hH]
| [mM][iI][lL]
@ -214,10 +232,13 @@ ASCIITLD = "." (
| [pP][fF]
| [pP][gG]
| [pP][hH]
| [pP][hH][oO][tT][oO][gG][rR][aA][pP][hH][yY]
| [pP][kK]
| [pP][lL]
| [pP][lL][uU][mM][bB][iI][nN][gG]
| [pP][mM]
| [pP][nN]
| [pP][oO][sS][tT]
| [pP][rR]
| [pP][rR][oO]
| [pP][sS]
@ -235,9 +256,11 @@ ASCIITLD = "." (
| [sS][cC]
| [sS][dD]
| [sS][eE]
| [sS][eE][xX][yY]
| [sS][gG]
| [sS][hH]
| [sS][iI]
| [sS][iI][nN][gG][lL][eE][sS]
| [sS][jJ]
| [sS][kK]
| [sS][lL]
@ -251,18 +274,22 @@ ASCIITLD = "." (
| [sS][xX]
| [sS][yY]
| [sS][zZ]
| [tT][aA][tT][tT][oO][oO]
| [tT][cC]
| [tT][dD]
| [tT][eE][cC][hH][nN][oO][lL][oO][gG][yY]
| [tT][eE][lL]
| [tT][fF]
| [tT][gG]
| [tT][hH]
| [tT][iI][pP][sS]
| [tT][jJ]
| [tT][kK]
| [tT][lL]
| [tT][mM]
| [tT][nN]
| [tT][oO]
| [tT][oO][dD][aA][yY]
| [tT][pP]
| [tT][rR]
| [tT][rR][aA][vV][eE][lL]
@ -273,61 +300,62 @@ ASCIITLD = "." (
| [uU][aA]
| [uU][gG]
| [uU][kK]
| [uU][nN][oO]
| [uU][sS]
| [uU][yY]
| [uU][zZ]
| [vV][aA]
| [vV][cC]
| [vV][eE]
| [vV][eE][nN][tT][uU][rR][eE][sS]
| [vV][gG]
| [vV][iI]
| [vV][nN]
| [vV][oO][yY][aA][gG][eE]
| [vV][uU]
| [wW][fF]
| [wW][sS]
| [xX][nN]--0[zZ][wW][mM]56[dD]
| [xX][nN]--11[bB]5[bB][sS]3[aA]9[aA][jJ]6[gG]
| [xX][nN]--3[eE]0[bB]707[eE]
| [xX][nN]--45[bB][rR][jJ]9[cC]
| [xX][nN]--80[aA][kK][hH][bB][yY][kK][nN][jJ]4[fF]
| [xX][nN]--80[aA][oO]21[aA]
| [xX][nN]--80[aA][sS][eE][hH][dD][bB]
| [xX][nN]--80[aA][sS][wW][gG]
| [xX][nN]--90[aA]3[aA][cC]
| [xX][nN]--9[tT]4[bB]11[yY][iI]5[aA]
| [xX][nN]--[cC][lL][cC][hH][cC]0[eE][aA]0[bB]2[gG]2[aA]9[gG][cC][dD]
| [xX][nN]--[dD][eE][bB][aA]0[aA][dD]
| [xX][nN]--[fF][iI][qQ][sS]8[sS]
| [xX][nN]--[fF][iI][qQ][zZ]9[sS]
| [xX][nN]--[fF][pP][cC][rR][jJ]9[cC]3[dD]
| [xX][nN]--[fF][zZ][cC]2[cC]9[eE]2[cC]
| [xX][nN]--[gG]6[wW]251[dD]
| [xX][nN]--[gG][eE][cC][rR][jJ]9[cC]
| [xX][nN]--[hH]2[bB][rR][jJ]9[cC]
| [xX][nN]--[hH][gG][bB][kK]6[aA][jJ]7[fF]53[bB][bB][aA]
| [xX][nN]--[hH][lL][cC][jJ]6[aA][yY][aA]9[eE][sS][cC]7[aA]
| [xX][nN]--[jJ]1[aA][mM][hH]
| [xX][nN]--[jJ]6[wW]193[gG]
| [xX][nN]--[jJ][xX][aA][lL][pP][dD][lL][pP]
| [xX][nN]--[kK][gG][bB][eE][cC][hH][tT][vV]
| [xX][nN]--[kK][pP][rR][wW]13[dD]
| [xX][nN]--[kK][pP][rR][yY]57[dD]
| [xX][nN]--[lL]1[aA][cC][cC]
| [xX][nN]--[lL][gG][bB][bB][aA][tT]1[aA][dD]8[jJ]
| [xX][nN]--[mM][gG][bB]9[aA][wW][bB][fF]
| [xX][nN]--[mM][gG][bB][aA]3[aA]4[fF]16[aA]
| [xX][nN]--[mM][gG][bB][aA][aA][mM]7[aA]8[hH]
| [xX][nN]--[mM][gG][bB][aA][yY][hH]7[gG][pP][aA]
| [xX][nN]--[mM][gG][bB][bB][hH]1[aA]71[eE]
| [xX][nN]--[mM][gG][bB][cC]0[aA]9[aA][zZ][cC][gG]
| [xX][nN]--[mM][gG][bB][eE][rR][pP]4[aA]5[dD]4[aA][rR]
| [xX][nN]--[mM][gG][bB][xX]4[cC][dD]0[aA][bB]
| [xX][nN]--[nN][gG][bB][cC]5[aA][zZ][dD]
| [xX][nN]--[oO]3[cC][wW]4[hH]
| [xX][nN]--[oO][gG][bB][pP][fF]8[fF][lL]
| [xX][nN]--[pP]1[aA][iI]
| [xX][nN]--[pP][gG][bB][sS]0[dD][hH]
| [xX][nN]--[qQ]9[jJ][yY][bB]4[cC]
| [xX][nN]--[sS]9[bB][rR][jJ]9[cC]
| [xX][nN]--[uU][nN][uU][pP]4[yY]
| [xX][nN]--[wW][gG][bB][hH]1[cC]
| [xX][nN]--[wW][gG][bB][lL]6[aA]
| [xX][nN]--[xX][kK][cC]2[aA][lL]3[hH][yY][eE]2[aA]
| [xX][nN]--[xX][kK][cC]2[dD][lL]3[aA]5[eE][eE]0[hH]
| [xX][nN]--[yY][fF][rR][oO]4[iI]67[oO]
| [xX][nN]--[yY][gG][bB][iI]2[aA][mM][mM][xX]
| [xX][nN]--[zZ][cC][kK][zZ][aA][hH]
| [xX][xX][xX]
| [yY][eE]
| [yY][tT]

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex. */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT */
package org.apache.lucene.analysis.standard;
@ -58,64 +58,63 @@ class ClassicTokenizerImpl implements StandardTokenizerInterface {
* Translates characters to character classes
*/
private static final String ZZ_CMAP_PACKED =
"\11\0\1\0\1\15\1\0\1\0\1\14\22\0\1\0\5\0\1\5"+
"\1\3\4\0\1\11\1\7\1\4\1\11\12\2\6\0\1\6\32\12"+
"\4\0\1\10\1\0\32\12\57\0\1\12\12\0\1\12\4\0\1\12"+
"\5\0\27\12\1\0\37\12\1\0\u0128\12\2\0\22\12\34\0\136\12"+
"\2\0\11\12\2\0\7\12\16\0\2\12\16\0\5\12\11\0\1\12"+
"\213\0\1\12\13\0\1\12\1\0\3\12\1\0\1\12\1\0\24\12"+
"\1\0\54\12\1\0\10\12\2\0\32\12\14\0\202\12\12\0\71\12"+
"\2\0\2\12\2\0\2\12\3\0\46\12\2\0\2\12\67\0\46\12"+
"\2\0\1\12\7\0\47\12\110\0\33\12\5\0\3\12\56\0\32\12"+
"\5\0\13\12\25\0\12\2\7\0\143\12\1\0\1\12\17\0\2\12"+
"\11\0\12\2\3\12\23\0\1\12\1\0\33\12\123\0\46\12\u015f\0"+
"\65\12\3\0\1\12\22\0\1\12\7\0\12\12\4\0\12\2\25\0"+
"\10\12\2\0\2\12\2\0\26\12\1\0\7\12\1\0\1\12\3\0"+
"\4\12\42\0\2\12\1\0\3\12\4\0\12\2\2\12\23\0\6\12"+
"\4\0\2\12\2\0\26\12\1\0\7\12\1\0\2\12\1\0\2\12"+
"\1\0\2\12\37\0\4\12\1\0\1\12\7\0\12\2\2\0\3\12"+
"\20\0\7\12\1\0\1\12\1\0\3\12\1\0\26\12\1\0\7\12"+
"\1\0\2\12\1\0\5\12\3\0\1\12\22\0\1\12\17\0\1\12"+
"\5\0\12\2\25\0\10\12\2\0\2\12\2\0\26\12\1\0\7\12"+
"\1\0\2\12\2\0\4\12\3\0\1\12\36\0\2\12\1\0\3\12"+
"\4\0\12\2\25\0\6\12\3\0\3\12\1\0\4\12\3\0\2\12"+
"\1\0\1\12\1\0\2\12\3\0\2\12\3\0\3\12\3\0\10\12"+
"\1\0\3\12\55\0\11\2\25\0\10\12\1\0\3\12\1\0\27\12"+
"\1\0\12\12\1\0\5\12\46\0\2\12\4\0\12\2\25\0\10\12"+
"\1\0\3\12\1\0\27\12\1\0\12\12\1\0\5\12\44\0\1\12"+
"\1\0\2\12\4\0\12\2\25\0\10\12\1\0\3\12\1\0\27\12"+
"\1\0\20\12\46\0\2\12\4\0\12\2\25\0\22\12\3\0\30\12"+
"\1\0\11\12\1\0\1\12\2\0\7\12\71\0\1\1\60\12\1\1"+
"\2\12\14\1\7\12\11\1\12\2\47\0\2\12\1\0\1\12\2\0"+
"\2\12\1\0\1\12\2\0\1\12\6\0\4\12\1\0\7\12\1\0"+
"\3\12\1\0\1\12\1\0\1\12\2\0\2\12\1\0\4\12\1\0"+
"\2\12\11\0\1\12\2\0\5\12\1\0\1\12\11\0\12\2\2\0"+
"\2\12\42\0\1\12\37\0\12\2\26\0\10\12\1\0\42\12\35\0"+
"\4\12\164\0\42\12\1\0\5\12\1\0\2\12\25\0\12\2\6\0"+
"\6\12\112\0\46\12\12\0\47\12\11\0\132\12\5\0\104\12\5\0"+
"\122\12\6\0\7\12\1\0\77\12\1\0\1\12\1\0\4\12\2\0"+
"\7\12\1\0\1\12\1\0\4\12\2\0\47\12\1\0\1\12\1\0"+
"\4\12\2\0\37\12\1\0\1\12\1\0\4\12\2\0\7\12\1\0"+
"\1\12\1\0\4\12\2\0\7\12\1\0\7\12\1\0\27\12\1\0"+
"\37\12\1\0\1\12\1\0\4\12\2\0\7\12\1\0\47\12\1\0"+
"\23\12\16\0\11\2\56\0\125\12\14\0\u026c\12\2\0\10\12\12\0"+
"\32\12\5\0\113\12\225\0\64\12\54\0\12\2\46\0\12\2\6\0"+
"\130\12\10\0\51\12\u0557\0\234\12\4\0\132\12\6\0\26\12\2\0"+
"\6\12\2\0\46\12\2\0\6\12\2\0\10\12\1\0\1\12\1\0"+
"\1\12\1\0\1\12\1\0\37\12\2\0\65\12\1\0\7\12\1\0"+
"\1\12\3\0\3\12\1\0\7\12\3\0\4\12\2\0\6\12\4\0"+
"\15\12\5\0\3\12\1\0\7\12\202\0\1\12\202\0\1\12\4\0"+
"\1\12\2\0\12\12\1\0\1\12\3\0\5\12\6\0\1\12\1\0"+
"\1\12\1\0\1\12\1\0\4\12\1\0\3\12\1\0\7\12\u0ecb\0"+
"\2\12\52\0\5\12\12\0\1\13\124\13\10\13\2\13\2\13\132\13"+
"\1\13\3\13\6\13\50\13\3\13\1\0\136\12\21\0\30\12\70\0"+
"\20\13\u0100\0\200\13\200\0\u19b6\13\12\13\100\0\u51a6\13\132\13\u048d\12"+
"\u0773\0\u2ba4\12\u215c\0\u012e\13\322\13\7\12\14\0\5\12\5\0\1\12"+
"\1\0\12\12\1\0\15\12\1\0\5\12\1\0\1\12\1\0\2\12"+
"\1\0\2\12\1\0\154\12\41\0\u016b\12\22\0\100\12\2\0\66\12"+
"\50\0\14\12\164\0\3\12\1\0\1\12\1\0\207\12\23\0\12\2"+
"\7\0\32\12\6\0\32\12\12\0\1\13\72\13\37\12\3\0\6\12"+
"\2\0\6\12\2\0\6\12\2\0\3\12\43\0";
"\46\0\1\5\1\3\4\0\1\11\1\7\1\4\1\11\12\2\6\0"+
"\1\6\32\12\4\0\1\10\1\0\32\12\57\0\1\12\12\0\1\12"+
"\4\0\1\12\5\0\27\12\1\0\37\12\1\0\u0128\12\2\0\22\12"+
"\34\0\136\12\2\0\11\12\2\0\7\12\16\0\2\12\16\0\5\12"+
"\11\0\1\12\213\0\1\12\13\0\1\12\1\0\3\12\1\0\1\12"+
"\1\0\24\12\1\0\54\12\1\0\10\12\2\0\32\12\14\0\202\12"+
"\12\0\71\12\2\0\2\12\2\0\2\12\3\0\46\12\2\0\2\12"+
"\67\0\46\12\2\0\1\12\7\0\47\12\110\0\33\12\5\0\3\12"+
"\56\0\32\12\5\0\13\12\25\0\12\2\7\0\143\12\1\0\1\12"+
"\17\0\2\12\11\0\12\2\3\12\23\0\1\12\1\0\33\12\123\0"+
"\46\12\u015f\0\65\12\3\0\1\12\22\0\1\12\7\0\12\12\4\0"+
"\12\2\25\0\10\12\2\0\2\12\2\0\26\12\1\0\7\12\1\0"+
"\1\12\3\0\4\12\42\0\2\12\1\0\3\12\4\0\12\2\2\12"+
"\23\0\6\12\4\0\2\12\2\0\26\12\1\0\7\12\1\0\2\12"+
"\1\0\2\12\1\0\2\12\37\0\4\12\1\0\1\12\7\0\12\2"+
"\2\0\3\12\20\0\7\12\1\0\1\12\1\0\3\12\1\0\26\12"+
"\1\0\7\12\1\0\2\12\1\0\5\12\3\0\1\12\22\0\1\12"+
"\17\0\1\12\5\0\12\2\25\0\10\12\2\0\2\12\2\0\26\12"+
"\1\0\7\12\1\0\2\12\2\0\4\12\3\0\1\12\36\0\2\12"+
"\1\0\3\12\4\0\12\2\25\0\6\12\3\0\3\12\1\0\4\12"+
"\3\0\2\12\1\0\1\12\1\0\2\12\3\0\2\12\3\0\3\12"+
"\3\0\10\12\1\0\3\12\55\0\11\2\25\0\10\12\1\0\3\12"+
"\1\0\27\12\1\0\12\12\1\0\5\12\46\0\2\12\4\0\12\2"+
"\25\0\10\12\1\0\3\12\1\0\27\12\1\0\12\12\1\0\5\12"+
"\44\0\1\12\1\0\2\12\4\0\12\2\25\0\10\12\1\0\3\12"+
"\1\0\27\12\1\0\20\12\46\0\2\12\4\0\12\2\25\0\22\12"+
"\3\0\30\12\1\0\11\12\1\0\1\12\2\0\7\12\71\0\1\1"+
"\60\12\1\1\2\12\14\1\7\12\11\1\12\2\47\0\2\12\1\0"+
"\1\12\2\0\2\12\1\0\1\12\2\0\1\12\6\0\4\12\1\0"+
"\7\12\1\0\3\12\1\0\1\12\1\0\1\12\2\0\2\12\1\0"+
"\4\12\1\0\2\12\11\0\1\12\2\0\5\12\1\0\1\12\11\0"+
"\12\2\2\0\2\12\42\0\1\12\37\0\12\2\26\0\10\12\1\0"+
"\42\12\35\0\4\12\164\0\42\12\1\0\5\12\1\0\2\12\25\0"+
"\12\2\6\0\6\12\112\0\46\12\12\0\47\12\11\0\132\12\5\0"+
"\104\12\5\0\122\12\6\0\7\12\1\0\77\12\1\0\1\12\1\0"+
"\4\12\2\0\7\12\1\0\1\12\1\0\4\12\2\0\47\12\1\0"+
"\1\12\1\0\4\12\2\0\37\12\1\0\1\12\1\0\4\12\2\0"+
"\7\12\1\0\1\12\1\0\4\12\2\0\7\12\1\0\7\12\1\0"+
"\27\12\1\0\37\12\1\0\1\12\1\0\4\12\2\0\7\12\1\0"+
"\47\12\1\0\23\12\16\0\11\2\56\0\125\12\14\0\u026c\12\2\0"+
"\10\12\12\0\32\12\5\0\113\12\225\0\64\12\54\0\12\2\46\0"+
"\12\2\6\0\130\12\10\0\51\12\u0557\0\234\12\4\0\132\12\6\0"+
"\26\12\2\0\6\12\2\0\46\12\2\0\6\12\2\0\10\12\1\0"+
"\1\12\1\0\1\12\1\0\1\12\1\0\37\12\2\0\65\12\1\0"+
"\7\12\1\0\1\12\3\0\3\12\1\0\7\12\3\0\4\12\2\0"+
"\6\12\4\0\15\12\5\0\3\12\1\0\7\12\202\0\1\12\202\0"+
"\1\12\4\0\1\12\2\0\12\12\1\0\1\12\3\0\5\12\6\0"+
"\1\12\1\0\1\12\1\0\1\12\1\0\4\12\1\0\3\12\1\0"+
"\7\12\u0ecb\0\2\12\52\0\5\12\12\0\1\13\124\13\10\13\2\13"+
"\2\13\132\13\1\13\3\13\6\13\50\13\3\13\1\0\136\12\21\0"+
"\30\12\70\0\20\13\u0100\0\200\13\200\0\u19b6\13\12\13\100\0\u51a6\13"+
"\132\13\u048d\12\u0773\0\u2ba4\12\u215c\0\u012e\13\322\13\7\12\14\0\5\12"+
"\5\0\1\12\1\0\12\12\1\0\15\12\1\0\5\12\1\0\1\12"+
"\1\0\2\12\1\0\2\12\1\0\154\12\41\0\u016b\12\22\0\100\12"+
"\2\0\66\12\50\0\14\12\164\0\3\12\1\0\1\12\1\0\207\12"+
"\23\0\12\2\7\0\32\12\6\0\32\12\12\0\1\13\72\13\37\12"+
"\3\0\6\12\2\0\6\12\2\0\6\12\2\0\3\12\43\0";
/**
* Translates characters to character classes
@ -128,13 +127,12 @@ class ClassicTokenizerImpl implements StandardTokenizerInterface {
private static final int [] ZZ_ACTION = zzUnpackAction();
private static final String ZZ_ACTION_PACKED_0 =
"\1\0\1\1\3\2\1\3\1\1\13\0\1\2\3\4"+
"\2\0\1\5\1\0\1\5\3\4\6\5\1\6\1\4"+
"\2\7\1\10\1\0\1\10\3\0\2\10\1\11\1\12"+
"\1\4";
"\1\0\1\1\3\2\1\3\13\0\1\2\3\4\2\0"+
"\1\5\1\0\1\5\3\4\6\5\1\6\1\4\2\7"+
"\1\10\1\0\1\10\3\0\2\10\1\11\1\12\1\4";
private static int [] zzUnpackAction() {
int [] result = new int[51];
int [] result = new int[50];
int offset = 0;
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
return result;
@ -159,16 +157,16 @@ class ClassicTokenizerImpl implements StandardTokenizerInterface {
private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
private static final String ZZ_ROWMAP_PACKED_0 =
"\0\0\0\16\0\34\0\52\0\70\0\16\0\106\0\124"+
"\0\142\0\160\0\176\0\214\0\232\0\250\0\266\0\304"+
"\0\322\0\340\0\356\0\374\0\u010a\0\u0118\0\u0126\0\u0134"+
"\0\u0142\0\u0150\0\u015e\0\u016c\0\u017a\0\u0188\0\u0196\0\u01a4"+
"\0\u01b2\0\u01c0\0\u01ce\0\u01dc\0\u01ea\0\u01f8\0\322\0\u0206"+
"\0\u0214\0\u0222\0\u0230\0\u023e\0\u024c\0\u025a\0\124\0\214"+
"\0\u0268\0\u0276\0\u0284";
"\0\0\0\14\0\30\0\44\0\60\0\14\0\74\0\110"+
"\0\124\0\140\0\154\0\170\0\204\0\220\0\234\0\250"+
"\0\264\0\300\0\314\0\330\0\344\0\360\0\374\0\u0108"+
"\0\u0114\0\u0120\0\u012c\0\u0138\0\u0144\0\u0150\0\u015c\0\u0168"+
"\0\u0174\0\u0180\0\u018c\0\u0198\0\u01a4\0\250\0\u01b0\0\u01bc"+
"\0\u01c8\0\u01d4\0\u01e0\0\u01ec\0\u01f8\0\74\0\154\0\u0204"+
"\0\u0210\0\u021c";
private static int [] zzUnpackRowMap() {
int [] result = new int[51];
int [] result = new int[50];
int offset = 0;
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
return result;
@ -191,49 +189,49 @@ class ClassicTokenizerImpl implements StandardTokenizerInterface {
private static final int [] ZZ_TRANS = zzUnpackTrans();
private static final String ZZ_TRANS_PACKED_0 =
"\1\2\1\3\1\4\7\2\1\5\1\6\1\7\1\2"+
"\17\0\2\3\1\0\1\10\1\0\1\11\2\12\1\13"+
"\1\3\4\0\1\3\1\4\1\0\1\14\1\0\1\11"+
"\2\15\1\16\1\4\4\0\1\3\1\4\1\17\1\20"+
"\1\21\1\22\2\12\1\13\1\23\20\0\1\2\1\0"+
"\1\24\1\25\7\0\1\26\4\0\2\27\7\0\1\27"+
"\4\0\1\30\1\31\7\0\1\32\5\0\1\33\7\0"+
"\1\13\4\0\1\34\1\35\7\0\1\36\4\0\1\37"+
"\1\40\7\0\1\41\4\0\1\42\1\43\7\0\1\44"+
"\15\0\1\45\4\0\1\24\1\25\7\0\1\46\15\0"+
"\1\47\4\0\2\27\7\0\1\50\4\0\1\3\1\4"+
"\1\17\1\10\1\21\1\22\2\12\1\13\1\23\4\0"+
"\2\24\1\0\1\51\1\0\1\11\2\52\1\0\1\24"+
"\4\0\1\24\1\25\1\0\1\53\1\0\1\11\2\54"+
"\1\55\1\25\4\0\1\24\1\25\1\0\1\51\1\0"+
"\1\11\2\52\1\0\1\26\4\0\2\27\1\0\1\56"+
"\2\0\1\56\2\0\1\27\4\0\2\30\1\0\1\52"+
"\1\0\1\11\2\52\1\0\1\30\4\0\1\30\1\31"+
"\1\0\1\54\1\0\1\11\2\54\1\55\1\31\4\0"+
"\1\30\1\31\1\0\1\52\1\0\1\11\2\52\1\0"+
"\1\32\5\0\1\33\1\0\1\55\2\0\3\55\1\33"+
"\4\0\2\34\1\0\1\57\1\0\1\11\2\12\1\13"+
"\1\34\4\0\1\34\1\35\1\0\1\60\1\0\1\11"+
"\2\15\1\16\1\35\4\0\1\34\1\35\1\0\1\57"+
"\1\0\1\11\2\12\1\13\1\36\4\0\2\37\1\0"+
"\1\12\1\0\1\11\2\12\1\13\1\37\4\0\1\37"+
"\1\40\1\0\1\15\1\0\1\11\2\15\1\16\1\40"+
"\4\0\1\37\1\40\1\0\1\12\1\0\1\11\2\12"+
"\1\13\1\41\4\0\2\42\1\0\1\13\2\0\3\13"+
"\1\42\4\0\1\42\1\43\1\0\1\16\2\0\3\16"+
"\1\43\4\0\1\42\1\43\1\0\1\13\2\0\3\13"+
"\1\44\6\0\1\17\6\0\1\45\4\0\1\24\1\25"+
"\1\0\1\61\1\0\1\11\2\52\1\0\1\26\4\0"+
"\2\27\1\0\1\56\2\0\1\56\2\0\1\50\4\0"+
"\2\24\7\0\1\24\4\0\2\30\7\0\1\30\4\0"+
"\2\34\7\0\1\34\4\0\2\37\7\0\1\37\4\0"+
"\2\42\7\0\1\42\4\0\2\62\7\0\1\62\4\0"+
"\2\24\7\0\1\63\4\0\2\62\1\0\1\56\2\0"+
"\1\56\2\0\1\62\4\0\2\24\1\0\1\61\1\0"+
"\1\11\2\52\1\0\1\24\3\0";
"\1\2\1\3\1\4\7\2\1\5\1\6\15\0\2\3"+
"\1\0\1\7\1\0\1\10\2\11\1\12\1\3\2\0"+
"\1\3\1\4\1\0\1\13\1\0\1\10\2\14\1\15"+
"\1\4\2\0\1\3\1\4\1\16\1\17\1\20\1\21"+
"\2\11\1\12\1\22\2\0\1\23\1\24\7\0\1\25"+
"\2\0\2\26\7\0\1\26\2\0\1\27\1\30\7\0"+
"\1\31\3\0\1\32\7\0\1\12\2\0\1\33\1\34"+
"\7\0\1\35\2\0\1\36\1\37\7\0\1\40\2\0"+
"\1\41\1\42\7\0\1\43\13\0\1\44\2\0\1\23"+
"\1\24\7\0\1\45\13\0\1\46\2\0\2\26\7\0"+
"\1\47\2\0\1\3\1\4\1\16\1\7\1\20\1\21"+
"\2\11\1\12\1\22\2\0\2\23\1\0\1\50\1\0"+
"\1\10\2\51\1\0\1\23\2\0\1\23\1\24\1\0"+
"\1\52\1\0\1\10\2\53\1\54\1\24\2\0\1\23"+
"\1\24\1\0\1\50\1\0\1\10\2\51\1\0\1\25"+
"\2\0\2\26\1\0\1\55\2\0\1\55\2\0\1\26"+
"\2\0\2\27\1\0\1\51\1\0\1\10\2\51\1\0"+
"\1\27\2\0\1\27\1\30\1\0\1\53\1\0\1\10"+
"\2\53\1\54\1\30\2\0\1\27\1\30\1\0\1\51"+
"\1\0\1\10\2\51\1\0\1\31\3\0\1\32\1\0"+
"\1\54\2\0\3\54\1\32\2\0\2\33\1\0\1\56"+
"\1\0\1\10\2\11\1\12\1\33\2\0\1\33\1\34"+
"\1\0\1\57\1\0\1\10\2\14\1\15\1\34\2\0"+
"\1\33\1\34\1\0\1\56\1\0\1\10\2\11\1\12"+
"\1\35\2\0\2\36\1\0\1\11\1\0\1\10\2\11"+
"\1\12\1\36\2\0\1\36\1\37\1\0\1\14\1\0"+
"\1\10\2\14\1\15\1\37\2\0\1\36\1\37\1\0"+
"\1\11\1\0\1\10\2\11\1\12\1\40\2\0\2\41"+
"\1\0\1\12\2\0\3\12\1\41\2\0\1\41\1\42"+
"\1\0\1\15\2\0\3\15\1\42\2\0\1\41\1\42"+
"\1\0\1\12\2\0\3\12\1\43\4\0\1\16\6\0"+
"\1\44\2\0\1\23\1\24\1\0\1\60\1\0\1\10"+
"\2\51\1\0\1\25\2\0\2\26\1\0\1\55\2\0"+
"\1\55\2\0\1\47\2\0\2\23\7\0\1\23\2\0"+
"\2\27\7\0\1\27\2\0\2\33\7\0\1\33\2\0"+
"\2\36\7\0\1\36\2\0\2\41\7\0\1\41\2\0"+
"\2\61\7\0\1\61\2\0\2\23\7\0\1\62\2\0"+
"\2\61\1\0\1\55\2\0\1\55\2\0\1\61\2\0"+
"\2\23\1\0\1\60\1\0\1\10\2\51\1\0\1\23"+
"\1\0";
private static int [] zzUnpackTrans() {
int [] result = new int[658];
int [] result = new int[552];
int offset = 0;
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
return result;
@ -271,11 +269,11 @@ class ClassicTokenizerImpl implements StandardTokenizerInterface {
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
private static final String ZZ_ATTRIBUTE_PACKED_0 =
"\1\0\1\11\3\1\1\11\1\1\13\0\4\1\2\0"+
"\1\1\1\0\17\1\1\0\1\1\3\0\5\1";
"\1\0\1\11\3\1\1\11\13\0\4\1\2\0\1\1"+
"\1\0\17\1\1\0\1\1\3\0\5\1";
private static int [] zzUnpackAttribute() {
int [] result = new int[51];
int [] result = new int[50];
int offset = 0;
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
return result;
@ -372,7 +370,6 @@ public final void getText(CharTermAttribute t) {
/**
* Creates a new scanner
* There is also a java.io.InputStream version of this constructor.
*
* @param in the java.io.Reader to read input from.
*/
@ -381,7 +378,6 @@ public final void getText(CharTermAttribute t) {
}
/**
* Unpacks the compressed character translation table.
*
@ -392,7 +388,7 @@ public final void getText(CharTermAttribute t) {
char [] map = new char[0x10000];
int i = 0; /* index in packed string */
int j = 0; /* index in unpacked array */
while (i < 1154) {
while (i < 1138) {
int count = packed.charAt(i++);
char value = packed.charAt(i++);
do map[j++] = value; while (--count > 0);

View File

@ -116,8 +116,6 @@ LETTER = !(![:letter:]|{CJ})
// Chinese and Japanese (but NOT Korean, which is included in [:letter:])
CJ = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
WHITESPACE = \r\n | [ \r\n\t\f]
%%
{ALPHANUM} { return ALPHANUM; }
@ -131,4 +129,4 @@ WHITESPACE = \r\n | [ \r\n\t\f]
{ACRONYM_DEP} { return ACRONYM_DEP; }
/** Ignore the rest */
. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
[^] { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }

View File

@ -18,4 +18,4 @@
WARNING: if you change StandardTokenizerImpl*.jflex or UAX29URLEmailTokenizer
and need to regenerate the tokenizer, only use the trunk version
of JFlex 1.5 (with a minimum SVN revision 607) at the moment!
of JFlex 1.5 (with a minimum SVN revision 722) at the moment!

View File

@ -1,11 +1,12 @@
/*
* Copyright 2010 The Apache Software Foundation.
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@ -13,7 +14,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Generated using ICU4J 52.1.0.0
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
@ -39,6 +39,12 @@ FormatSupp = (
| ([\ud834][\uDD73-\uDD7A])
| ([\udb40][\uDC01\uDC20-\uDC7F])
)
NumericSupp = (
([\ud805][\uDEC0-\uDEC9])
| ([\ud804][\uDC66-\uDC6F\uDCF0-\uDCF9\uDD36-\uDD3F\uDDD0-\uDDD9])
| ([\ud835][\uDFCE-\uDFFF])
| ([\ud801][\uDCA0-\uDCA9])
)
ExtendSupp = (
([\ud81b][\uDF51-\uDF7E\uDF8F-\uDF92])
| ([\ud805][\uDEAB-\uDEB7])
@ -48,12 +54,6 @@ ExtendSupp = (
| ([\udb40][\uDD00-\uDDEF])
| ([\ud802][\uDE01-\uDE03\uDE05\uDE06\uDE0C-\uDE0F\uDE38-\uDE3A\uDE3F])
)
NumericSupp = (
([\ud805][\uDEC0-\uDEC9])
| ([\ud804][\uDC66-\uDC6F\uDCF0-\uDCF9\uDD36-\uDD3F\uDDD0-\uDDD9])
| ([\ud835][\uDFCE-\uDFFF])
| ([\ud801][\uDCA0-\uDCA9])
)
KatakanaSupp = (
([\ud82c][\uDC00])
)
@ -129,3 +129,15 @@ HiraganaSupp = (
([\ud83c][\uDE00])
| ([\ud82c][\uDC01])
)
SingleQuoteSupp = (
[]
)
DoubleQuoteSupp = (
[]
)
HebrewLetterSupp = (
[]
)
RegionalIndicatorSupp = (
([\ud83c][\uDDE6-\uDDFF])
)

View File

@ -32,11 +32,13 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
* <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
* <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
* <li>&lt;KATAKANA&gt;: A sequence of katakana characters</li>
* <li>&lt;HANGUL&gt;: A sequence of Hangul characters</li>
* </ul>
*/
%%
%unicode 6.1
%unicode 6.3
%integer
%final
%public
@ -47,33 +49,40 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%buffer 4096
%include SUPPLEMENTARY.jflex-macro
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
Format = ([\p{WB:Format}] | {FormatSupp})
Numeric = ([\p{WB:Numeric}] | {NumericSupp})
Extend = ([\p{WB:Extend}] | {ExtendSupp})
Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
Han = ([\p{Script:Han}] | {HanSupp})
Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
ALetter = (\p{WB:ALetter} | {ALetterSupp})
Format = (\p{WB:Format} | {FormatSupp})
Numeric = ([\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] | {NumericSupp})
Extend = (\p{WB:Extend} | {ExtendSupp})
Katakana = (\p{WB:Katakana} | {KatakanaSupp})
MidLetter = (\p{WB:MidLetter} | {MidLetterSupp})
MidNum = (\p{WB:MidNum} | {MidNumSupp})
MidNumLet = (\p{WB:MidNumLet} | {MidNumLetSupp})
ExtendNumLet = (\p{WB:ExtendNumLet} | {ExtendNumLetSupp})
ComplexContext = (\p{LB:Complex_Context} | {ComplexContextSupp})
Han = (\p{Script:Han} | {HanSupp})
Hiragana = (\p{Script:Hiragana} | {HiraganaSupp})
SingleQuote = (\p{WB:Single_Quote} | {SingleQuoteSupp})
DoubleQuote = (\p{WB:Double_Quote} | {DoubleQuoteSupp})
HebrewLetter = (\p{WB:Hebrew_Letter} | {HebrewLetterSupp})
RegionalIndicator = (\p{WB:Regional_Indicator} | {RegionalIndicatorSupp})
HebrewOrALetter = ({HebrewLetter} | {ALetter})
// Script=Hangul & ALetter
HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
// UAX#29 WB4. X (Extend | Format)* --> X
//
ALetterEx = {ALetter} ({Format} | {Extend})*
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
NumericEx = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
KatakanaEx = {Katakana} ({Format} | {Extend})*
MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] ({Format} | {Extend})*
HebrewOrALetterEx = {HebrewOrALetter} ({Format} | {Extend})*
NumericEx = {Numeric} ({Format} | {Extend})*
KatakanaEx = {Katakana} ({Format} | {Extend})*
MidLetterEx = ({MidLetter} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
MidNumericEx = ({MidNum} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
HanEx = {Han} ({Format} | {Extend})*
HiraganaEx = {Hiragana} ({Format} | {Extend})*
SingleQuoteEx = {SingleQuote} ({Format} | {Extend})*
DoubleQuoteEx = {DoubleQuote} ({Format} | {Extend})*
HebrewLetterEx = {HebrewLetter} ({Format} | {Extend})*
RegionalIndicatorEx = {RegionalIndicator} ({Format} | {Extend})*
HanEx = {Han} ({Format} | {Extend})*
HiraganaEx = {Hiragana} ({Format} | {Extend})*
%{
/** Alphanumeric sequences */
@ -121,15 +130,12 @@ HiraganaEx = {Hiragana} ({Format} | {Extend})*
<<EOF>> { return StandardTokenizerInterface.YYEOF; }
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet) × Numeric
// WB12. Numeric × (MidNum | MidNumLet) Numeric
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
// WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
// WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
//
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
| {MidNumericEx} {NumericEx}
| {NumericEx})*
{ExtendNumLetEx}*
{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
{ return NUMERIC_TYPE; }
// subset of the below for typing purposes only!
@ -139,21 +145,31 @@ HiraganaEx = {Hiragana} ({Format} | {Extend})*
{KatakanaEx}+
{ return KATAKANA_TYPE; }
// UAX#29 WB5. ALetter × ALetter
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
// WB7. ALetter (MidLetter | MidNumLet) × ALetter
// WB9. ALetter × Numeric
// WB10. Numeric × ALetter
// UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
// WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
// WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
// WB7a. Hebrew_Letter × Single_Quote
// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
// WB9. (ALetter | Hebrew_Letter) × Numeric
// WB10. Numeric × (ALetter | Hebrew_Letter)
// WB13. Katakana × Katakana
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
//
{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ )
({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
{ExtendNumLetEx}* ( {KatakanaEx}
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )*
)+
)
({ExtendNumLetEx}+ ( {KatakanaEx}
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )*
)+
)
)*
{ExtendNumLetEx}*
{ return WORD_TYPE; }
@ -166,7 +182,7 @@ HiraganaEx = {Hiragana} ({Format} | {Extend})*
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
// In Unicode 6.1, only one character has the \p{Line_Break = Contingent_Break}
// In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC ( ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
@ -188,6 +204,8 @@ HiraganaEx = {Hiragana} ({Format} | {Extend})*
// UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB13c. Regional_Indicator × Regional_Indicator
// WB14. Any ÷ Any
//
[^] { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
{RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
{ /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }

View File

@ -35,11 +35,13 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
* <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
* <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
* <li>&lt;KATAKANA&gt;: A sequence of katakana characters</li>
* <li>&lt;HANGUL&gt;: A sequence of Hangul characters</li>
* </ul>
*/
%%
%unicode 6.1
%unicode 6.3
%integer
%final
%public
@ -50,33 +52,39 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%buffer 4096
%include SUPPLEMENTARY.jflex-macro
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
Format = ([\p{WB:Format}] | {FormatSupp})
Numeric = ([\p{WB:Numeric}] | {NumericSupp})
Extend = ([\p{WB:Extend}] | {ExtendSupp})
Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
Han = ([\p{Script:Han}] | {HanSupp})
Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
ALetter = (\p{WB:ALetter} | {ALetterSupp})
Format = (\p{WB:Format} | {FormatSupp})
Numeric = ([\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] | {NumericSupp})
Extend = (\p{WB:Extend} | {ExtendSupp})
Katakana = (\p{WB:Katakana} | {KatakanaSupp})
MidLetter = (\p{WB:MidLetter} | {MidLetterSupp})
MidNum = (\p{WB:MidNum} | {MidNumSupp})
MidNumLet = (\p{WB:MidNumLet} | {MidNumLetSupp})
ExtendNumLet = (\p{WB:ExtendNumLet} | {ExtendNumLetSupp})
ComplexContext = (\p{LB:Complex_Context} | {ComplexContextSupp})
Han = (\p{Script:Han} | {HanSupp})
Hiragana = (\p{Script:Hiragana} | {HiraganaSupp})
SingleQuote = (\p{WB:Single_Quote} | {SingleQuoteSupp})
DoubleQuote = (\p{WB:Double_Quote} | {DoubleQuoteSupp})
HebrewLetter = (\p{WB:Hebrew_Letter} | {HebrewLetterSupp})
RegionalIndicator = (\p{WB:Regional_Indicator} | {RegionalIndicatorSupp})
HebrewOrALetter = ({HebrewLetter} | {ALetter})
// Script=Hangul & ALetter
HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
// UAX#29 WB4. X (Extend | Format)* --> X
//
ALetterEx = {ALetter} ({Format} | {Extend})*
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
NumericEx = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
KatakanaEx = {Katakana} ({Format} | {Extend})*
MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
HanEx = {Han} ({Format} | {Extend})*
HiraganaEx = {Hiragana} ({Format} | {Extend})*
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] ({Format} | {Extend})*
HebrewOrALetterEx = {HebrewOrALetter} ({Format} | {Extend})*
NumericEx = {Numeric} ({Format} | {Extend})*
KatakanaEx = {Katakana} ({Format} | {Extend})*
MidLetterEx = ({MidLetter} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
MidNumericEx = ({MidNum} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
HanEx = {Han} ({Format} | {Extend})*
HiraganaEx = {Hiragana} ({Format} | {Extend})*
SingleQuoteEx = {SingleQuote} ({Format} | {Extend})*
DoubleQuoteEx = {DoubleQuote} ({Format} | {Extend})*
HebrewLetterEx = {HebrewLetter} ({Format} | {Extend})*
RegionalIndicatorEx = {RegionalIndicator} ({Format} | {Extend})*
// URL and E-mail syntax specifications:
//
@ -213,15 +221,12 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
{EMAIL} { return EMAIL_TYPE; }
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet) × Numeric
// WB12. Numeric × (MidNum | MidNumLet) Numeric
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
// WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
// WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
//
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
| {MidNumericEx} {NumericEx}
| {NumericEx})*
{ExtendNumLetEx}*
{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
{ return NUMERIC_TYPE; }
// subset of the below for typing purposes only!
@ -231,21 +236,31 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
{KatakanaEx}+
{ return KATAKANA_TYPE; }
// UAX#29 WB5. ALetter × ALetter
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
// WB7. ALetter (MidLetter | MidNumLet) × ALetter
// WB9. ALetter × Numeric
// WB10. Numeric × ALetter
// UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
// WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
// WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
// WB7a. Hebrew_Letter × Single_Quote
// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
// WB9. (ALetter | Hebrew_Letter) × Numeric
// WB10. Numeric × (ALetter | Hebrew_Letter)
// WB13. Katakana × Katakana
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
//
{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ )
({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )*
)+
)
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )*
)+
)
)*
{ExtendNumLetEx}*
{ return WORD_TYPE; }
@ -258,7 +273,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
// In Unicode 6.1, only one character has the \p{Line_Break = Contingent_Break}
// In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC ( ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
@ -280,6 +295,8 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
// UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB13c. Regional_Indicator × Regional_Indicator
// WB14. Any ÷ Any
//
[^] { /* Break so we don't hit fall-through warning: */ break;/* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
{RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
{ /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex. */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT */
package org.apache.lucene.analysis.wikipedia;
@ -84,21 +84,20 @@ class WikipediaTokenizerImpl {
private static final int [] ZZ_ACTION = zzUnpackAction();
private static final String ZZ_ACTION_PACKED_0 =
"\12\0\4\1\4\2\1\3\1\1\1\4\1\1\2\5"+
"\1\6\2\5\1\7\1\5\2\10\1\11\1\12\1\11"+
"\1\13\1\14\1\10\1\15\1\16\1\15\1\17\1\20"+
"\1\10\1\21\1\10\4\22\1\23\1\22\1\24\1\25"+
"\1\26\3\0\1\27\14\0\1\30\1\31\1\32\1\33"+
"\1\11\1\0\1\34\1\35\1\36\1\0\1\37\1\0"+
"\1\40\3\0\1\41\1\42\2\43\1\42\2\44\2\0"+
"\1\43\1\0\14\43\1\42\3\0\1\11\1\45\3\0"+
"\1\46\1\47\5\0\1\50\4\0\1\50\2\0\2\50"+
"\2\0\1\11\5\0\1\31\1\42\1\43\1\51\3\0"+
"\1\11\2\0\1\52\30\0\1\53\2\0\1\54\1\55"+
"\1\56";
"\12\0\4\1\4\2\1\3\1\4\1\1\2\5\1\6"+
"\1\5\1\7\1\5\2\10\1\11\1\5\1\12\1\11"+
"\1\13\1\14\1\15\1\16\1\15\1\17\1\20\1\10"+
"\1\21\1\10\4\22\1\23\1\24\1\25\1\26\3\0"+
"\1\27\14\0\1\30\1\31\1\32\1\33\1\11\1\0"+
"\1\34\1\35\1\36\1\0\1\37\1\0\1\40\3\0"+
"\1\41\1\42\2\43\1\42\2\44\2\0\1\43\1\0"+
"\14\43\1\42\3\0\1\11\1\45\3\0\1\46\1\47"+
"\5\0\1\50\4\0\1\50\2\0\2\50\2\0\1\11"+
"\5\0\1\31\1\42\1\43\1\51\3\0\1\11\2\0"+
"\1\52\30\0\1\53\2\0\1\54\1\55\1\56";
private static int [] zzUnpackAction() {
int [] result = new int[184];
int [] result = new int[181];
int offset = 0;
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
return result;
@ -125,30 +124,30 @@ class WikipediaTokenizerImpl {
private static final String ZZ_ROWMAP_PACKED_0 =
"\0\0\0\54\0\130\0\204\0\260\0\334\0\u0108\0\u0134"+
"\0\u0160\0\u018c\0\u01b8\0\u01e4\0\u0210\0\u023c\0\u0268\0\u0294"+
"\0\u02c0\0\u02ec\0\u01b8\0\u0318\0\u0344\0\u0370\0\u01b8\0\u039c"+
"\0\u03c8\0\u03f4\0\u0420\0\u044c\0\u0478\0\u01b8\0\u039c\0\u04a4"+
"\0\u01b8\0\u04d0\0\u04fc\0\u0528\0\u0554\0\u0580\0\u05ac\0\u05d8"+
"\0\u0604\0\u0630\0\u065c\0\u0688\0\u06b4\0\u01b8\0\u06e0\0\u039c"+
"\0\u070c\0\u0738\0\u0764\0\u0790\0\u01b8\0\u01b8\0\u07bc\0\u07e8"+
"\0\u0814\0\u01b8\0\u0840\0\u086c\0\u0898\0\u08c4\0\u08f0\0\u091c"+
"\0\u0948\0\u0974\0\u09a0\0\u09cc\0\u09f8\0\u0a24\0\u0a50\0\u0a7c"+
"\0\u01b8\0\u01b8\0\u0aa8\0\u0ad4\0\u0b00\0\u0b00\0\u01b8\0\u0b2c"+
"\0\u02c0\0\u02ec\0\u01b8\0\u0318\0\u0344\0\u01b8\0\u0370\0\u039c"+
"\0\u03c8\0\u03f4\0\u0420\0\u01b8\0\u0370\0\u044c\0\u0478\0\u01b8"+
"\0\u04a4\0\u04d0\0\u04fc\0\u0528\0\u0554\0\u0580\0\u05ac\0\u05d8"+
"\0\u0604\0\u0630\0\u065c\0\u01b8\0\u0688\0\u0370\0\u06b4\0\u06e0"+
"\0\u070c\0\u01b8\0\u01b8\0\u0738\0\u0764\0\u0790\0\u01b8\0\u07bc"+
"\0\u07e8\0\u0814\0\u0840\0\u086c\0\u0898\0\u08c4\0\u08f0\0\u091c"+
"\0\u0948\0\u0974\0\u09a0\0\u09cc\0\u09f8\0\u01b8\0\u01b8\0\u0a24"+
"\0\u0a50\0\u0a7c\0\u0a7c\0\u01b8\0\u0aa8\0\u0ad4\0\u0b00\0\u0b2c"+
"\0\u0b58\0\u0b84\0\u0bb0\0\u0bdc\0\u0c08\0\u0c34\0\u0c60\0\u0c8c"+
"\0\u0cb8\0\u0ce4\0\u0d10\0\u0898\0\u0d3c\0\u0d68\0\u0d94\0\u0dc0"+
"\0\u0814\0\u0cb8\0\u0ce4\0\u0d10\0\u0d3c\0\u0d68\0\u0d94\0\u0dc0"+
"\0\u0dec\0\u0e18\0\u0e44\0\u0e70\0\u0e9c\0\u0ec8\0\u0ef4\0\u0f20"+
"\0\u0f4c\0\u0f78\0\u0fa4\0\u0fd0\0\u0ffc\0\u1028\0\u1054\0\u1080"+
"\0\u10ac\0\u10d8\0\u01b8\0\u1104\0\u1130\0\u115c\0\u1188\0\u01b8"+
"\0\u0f4c\0\u0f78\0\u0fa4\0\u0fd0\0\u0ffc\0\u1028\0\u1054\0\u01b8"+
"\0\u1080\0\u10ac\0\u10d8\0\u1104\0\u01b8\0\u1130\0\u115c\0\u1188"+
"\0\u11b4\0\u11e0\0\u120c\0\u1238\0\u1264\0\u1290\0\u12bc\0\u12e8"+
"\0\u1314\0\u1340\0\u136c\0\u1398\0\u13c4\0\u086c\0\u09f8\0\u13f0"+
"\0\u141c\0\u1448\0\u1474\0\u14a0\0\u14cc\0\u14f8\0\u1524\0\u01b8"+
"\0\u1550\0\u157c\0\u15a8\0\u15d4\0\u1600\0\u162c\0\u1658\0\u1684"+
"\0\u16b0\0\u01b8\0\u16dc\0\u1708\0\u1734\0\u1760\0\u178c\0\u17b8"+
"\0\u1314\0\u1340\0\u07e8\0\u0974\0\u136c\0\u1398\0\u13c4\0\u13f0"+
"\0\u141c\0\u1448\0\u1474\0\u14a0\0\u01b8\0\u14cc\0\u14f8\0\u1524"+
"\0\u1550\0\u157c\0\u15a8\0\u15d4\0\u1600\0\u162c\0\u01b8\0\u1658"+
"\0\u1684\0\u16b0\0\u16dc\0\u1708\0\u1734\0\u1760\0\u178c\0\u17b8"+
"\0\u17e4\0\u1810\0\u183c\0\u1868\0\u1894\0\u18c0\0\u18ec\0\u1918"+
"\0\u1944\0\u1970\0\u199c\0\u19c8\0\u19f4\0\u1a20\0\u1a4c\0\u1a78"+
"\0\u1aa4\0\u1ad0\0\u1afc\0\u1b28\0\u1b54\0\u01b8\0\u01b8\0\u01b8";
"\0\u1aa4\0\u1ad0\0\u01b8\0\u01b8\0\u01b8";
private static int [] zzUnpackRowMap() {
int [] result = new int[184];
int [] result = new int[181];
int offset = 0;
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
return result;
@ -172,152 +171,149 @@ class WikipediaTokenizerImpl {
private static final String ZZ_TRANS_PACKED_0 =
"\1\13\1\14\5\13\1\15\1\13\1\16\3\13\1\17"+
"\1\20\1\21\1\22\1\23\1\24\2\13\1\25\2\13"+
"\15\17\1\26\2\13\3\17\1\13\7\27\1\30\5\27"+
"\4\31\1\27\1\32\3\27\1\33\1\27\15\31\3\27"+
"\3\31\10\27\1\30\5\27\4\34\1\27\1\32\3\27"+
"\1\35\1\27\15\34\3\27\3\34\1\27\7\36\1\37"+
"\5\36\4\40\1\36\1\32\2\27\1\36\1\41\1\36"+
"\15\40\3\36\1\42\2\40\2\36\1\43\5\36\1\37"+
"\5\36\4\44\1\36\1\45\2\36\1\46\2\36\15\44"+
"\3\36\3\44\10\36\1\37\5\36\4\47\1\36\1\45"+
"\2\36\1\46\2\36\15\47\3\36\3\47\10\36\1\37"+
"\5\36\4\47\1\36\1\45\2\36\1\50\2\36\15\47"+
"\3\36\3\47\10\36\1\37\1\36\1\51\3\36\4\52"+
"\1\36\1\45\5\36\15\52\3\36\3\52\10\36\1\53"+
"\5\36\4\54\1\36\1\45\5\36\15\54\1\36\1\55"+
"\1\36\3\54\1\36\1\56\1\57\5\56\1\60\1\56"+
"\1\61\3\56\4\62\1\56\1\63\2\56\1\64\2\56"+
"\15\62\2\56\1\65\3\62\1\56\55\0\1\66\62\0"+
"\1\67\4\0\4\70\7\0\6\70\1\71\6\70\3\0"+
"\3\70\12\0\1\72\43\0\1\73\1\74\1\75\1\76"+
"\2\77\1\0\1\100\3\0\1\100\1\17\1\20\1\21"+
"\1\22\7\0\15\17\3\0\3\17\3\0\1\101\1\0"+
"\1\102\2\103\1\0\1\104\3\0\1\104\3\20\1\22"+
"\7\0\15\20\3\0\3\20\2\0\1\73\1\105\1\75"+
"\1\76\2\103\1\0\1\104\3\0\1\104\1\21\1\20"+
"\1\21\1\22\7\0\15\21\3\0\3\21\3\0\1\106"+
"\1\0\1\102\2\77\1\0\1\100\3\0\1\100\4\22"+
"\7\0\15\22\3\0\3\22\24\0\1\13\55\0\1\107"+
"\73\0\1\110\16\0\1\67\4\0\4\70\7\0\15\70"+
"\3\0\3\70\16\0\4\31\7\0\15\31\3\0\3\31"+
"\24\0\1\27\56\0\1\111\42\0\4\34\7\0\15\34"+
"\3\0\3\34\27\0\1\112\42\0\4\40\7\0\15\40"+
"\3\0\3\40\16\0\4\40\7\0\2\40\1\113\12\40"+
"\3\0\3\40\2\0\1\114\67\0\4\44\7\0\15\44"+
"\3\0\3\44\24\0\1\36\55\0\1\115\43\0\4\47"+
"\7\0\15\47\3\0\3\47\26\0\1\116\37\0\1\117"+
"\57\0\4\52\7\0\15\52\3\0\3\52\11\0\1\120"+
"\4\0\4\70\7\0\15\70\3\0\3\70\16\0\4\54"+
"\7\0\15\54\3\0\3\54\47\0\1\117\6\0\1\121"+
"\63\0\1\122\57\0\4\62\7\0\15\62\3\0\3\62"+
"\24\0\1\56\55\0\1\123\43\0\4\70\7\0\15\70"+
"\3\0\3\70\14\0\1\36\1\0\4\124\1\0\3\125"+
"\3\0\15\124\3\0\3\124\14\0\1\36\1\0\4\124"+
"\1\0\3\125\3\0\3\124\1\126\11\124\3\0\3\124"+
"\16\0\1\127\1\0\1\127\10\0\15\127\3\0\3\127"+
"\16\0\1\130\1\131\1\132\1\133\7\0\15\130\3\0"+
"\3\130\16\0\1\134\1\0\1\134\10\0\15\134\3\0"+
"\3\134\16\0\1\135\1\136\1\135\1\136\7\0\15\135"+
"\3\0\3\135\16\0\1\137\2\140\1\141\7\0\15\137"+
"\3\0\3\137\16\0\1\100\2\142\10\0\15\100\3\0"+
"\3\100\16\0\1\143\2\144\1\145\7\0\15\143\3\0"+
"\3\143\16\0\4\136\7\0\15\136\3\0\3\136\16\0"+
"\1\146\2\147\1\150\7\0\15\146\3\0\3\146\16\0"+
"\1\151\2\152\1\153\7\0\15\151\3\0\3\151\16\0"+
"\1\154\1\144\1\155\1\145\7\0\15\154\3\0\3\154"+
"\16\0\1\156\2\131\1\133\7\0\15\156\3\0\3\156"+
"\30\0\1\157\1\160\64\0\1\161\27\0\4\40\7\0"+
"\2\40\1\162\12\40\3\0\3\40\2\0\1\163\101\0"+
"\1\164\1\165\40\0\4\70\7\0\6\70\1\166\6\70"+
"\3\0\3\70\2\0\1\167\63\0\1\170\71\0\1\171"+
"\1\172\34\0\1\173\1\0\1\36\1\0\4\124\1\0"+
"\3\125\3\0\15\124\3\0\3\124\16\0\4\174\1\0"+
"\3\125\3\0\15\174\3\0\3\174\12\0\1\173\1\0"+
"\1\36\1\0\4\124\1\0\3\125\3\0\10\124\1\175"+
"\4\124\3\0\3\124\2\0\1\73\13\0\1\127\1\0"+
"\1\127\10\0\15\127\3\0\3\127\3\0\1\176\1\0"+
"\1\102\2\177\6\0\1\130\1\131\1\132\1\133\7\0"+
"\15\130\3\0\3\130\3\0\1\200\1\0\1\102\2\201"+
"\1\0\1\202\3\0\1\202\3\131\1\133\7\0\15\131"+
"\3\0\3\131\3\0\1\203\1\0\1\102\2\201\1\0"+
"\1\202\3\0\1\202\1\132\1\131\1\132\1\133\7\0"+
"\15\132\3\0\3\132\3\0\1\204\1\0\1\102\2\177"+
"\6\0\4\133\7\0\15\133\3\0\3\133\3\0\1\205"+
"\2\0\1\205\7\0\1\135\1\136\1\135\1\136\7\0"+
"\15\135\3\0\3\135\3\0\1\205\2\0\1\205\7\0"+
"\4\136\7\0\15\136\3\0\3\136\3\0\1\177\1\0"+
"\1\102\2\177\6\0\1\137\2\140\1\141\7\0\15\137"+
"\3\0\3\137\3\0\1\201\1\0\1\102\2\201\1\0"+
"\1\202\3\0\1\202\3\140\1\141\7\0\15\140\3\0"+
"\3\140\3\0\1\177\1\0\1\102\2\177\6\0\4\141"+
"\7\0\15\141\3\0\3\141\3\0\1\202\2\0\2\202"+
"\1\0\1\202\3\0\1\202\3\142\10\0\15\142\3\0"+
"\3\142\3\0\1\106\1\0\1\102\2\77\1\0\1\100"+
"\3\0\1\100\1\143\2\144\1\145\7\0\15\143\3\0"+
"\3\143\3\0\1\101\1\0\1\102\2\103\1\0\1\104"+
"\3\0\1\104\3\144\1\145\7\0\15\144\3\0\3\144"+
"\3\0\1\106\1\0\1\102\2\77\1\0\1\100\3\0"+
"\1\100\4\145\7\0\15\145\3\0\3\145\3\0\1\77"+
"\1\0\1\102\2\77\1\0\1\100\3\0\1\100\1\146"+
"\2\147\1\150\7\0\15\146\3\0\3\146\3\0\1\103"+
"\1\0\1\102\2\103\1\0\1\104\3\0\1\104\3\147"+
"\1\150\7\0\15\147\3\0\3\147\3\0\1\77\1\0"+
"\1\102\2\77\1\0\1\100\3\0\1\100\4\150\7\0"+
"\15\150\3\0\3\150\3\0\1\100\2\0\2\100\1\0"+
"\1\100\3\0\1\100\1\151\2\152\1\153\7\0\15\151"+
"\3\0\3\151\3\0\1\104\2\0\2\104\1\0\1\104"+
"\3\0\1\104\3\152\1\153\7\0\15\152\3\0\3\152"+
"\3\0\1\100\2\0\2\100\1\0\1\100\3\0\1\100"+
"\4\153\7\0\15\153\3\0\3\153\3\0\1\206\1\0"+
"\1\102\2\77\1\0\1\100\3\0\1\100\1\154\1\144"+
"\1\155\1\145\7\0\15\154\3\0\3\154\3\0\1\207"+
"\1\0\1\102\2\103\1\0\1\104\3\0\1\104\1\155"+
"\1\144\1\155\1\145\7\0\15\155\3\0\3\155\3\0"+
"\1\204\1\0\1\102\2\177\6\0\1\156\2\131\1\133"+
"\7\0\15\156\3\0\3\156\31\0\1\160\54\0\1\210"+
"\64\0\1\211\26\0\4\40\7\0\15\40\3\0\1\40"+
"\1\212\1\40\31\0\1\165\54\0\1\213\35\0\1\36"+
"\1\0\4\124\1\0\3\125\3\0\3\124\1\214\11\124"+
"\3\0\3\124\2\0\1\215\102\0\1\172\54\0\1\216"+
"\34\0\1\217\52\0\1\173\3\0\4\174\7\0\15\174"+
"\3\0\3\174\12\0\1\173\1\0\1\220\1\0\4\124"+
"\1\0\3\125\3\0\15\124\3\0\3\124\16\0\1\221"+
"\1\133\1\221\1\133\7\0\15\221\3\0\3\221\16\0"+
"\4\141\7\0\15\141\3\0\3\141\16\0\4\145\7\0"+
"\15\145\3\0\3\145\16\0\4\150\7\0\15\150\3\0"+
"\3\150\16\0\4\153\7\0\15\153\3\0\3\153\16\0"+
"\1\222\1\145\1\222\1\145\7\0\15\222\3\0\3\222"+
"\16\0\4\133\7\0\15\133\3\0\3\133\16\0\4\223"+
"\7\0\15\223\3\0\3\223\33\0\1\224\61\0\1\225"+
"\30\0\4\40\6\0\1\226\15\40\3\0\2\40\1\227"+
"\33\0\1\230\32\0\1\173\1\0\1\36\1\0\4\124"+
"\1\0\3\125\3\0\10\124\1\231\4\124\3\0\3\124"+
"\2\0\1\232\104\0\1\233\36\0\4\234\7\0\15\234"+
"\3\0\3\234\3\0\1\176\1\0\1\102\2\177\6\0"+
"\1\221\1\133\1\221\1\133\7\0\15\221\3\0\3\221"+
"\3\0\1\206\1\0\1\102\2\77\1\0\1\100\3\0"+
"\1\100\1\222\1\145\1\222\1\145\7\0\15\222\3\0"+
"\3\222\3\0\1\205\2\0\1\205\7\0\4\223\7\0"+
"\15\223\3\0\3\223\34\0\1\235\55\0\1\236\26\0"+
"\1\237\60\0\4\40\6\0\1\226\15\40\3\0\3\40"+
"\34\0\1\240\31\0\1\173\1\0\1\117\1\0\4\124"+
"\1\0\3\125\3\0\15\124\3\0\3\124\34\0\1\241"+
"\32\0\1\242\2\0\4\234\7\0\15\234\3\0\3\234"+
"\35\0\1\243\62\0\1\244\20\0\1\245\77\0\1\246"+
"\53\0\1\247\32\0\1\36\1\0\4\174\1\0\3\125"+
"\3\0\15\174\3\0\3\174\36\0\1\250\53\0\1\251"+
"\33\0\4\252\7\0\15\252\3\0\3\252\36\0\1\253"+
"\53\0\1\254\54\0\1\255\61\0\1\256\11\0\1\257"+
"\12\0\4\252\7\0\15\252\3\0\3\252\37\0\1\260"+
"\53\0\1\261\54\0\1\262\22\0\1\13\62\0\4\263"+
"\7\0\15\263\3\0\3\263\40\0\1\264\53\0\1\265"+
"\43\0\1\266\26\0\2\263\1\0\2\263\1\0\2\263"+
"\2\0\5\263\7\0\15\263\3\0\4\263\27\0\1\267"+
"\53\0\1\270\24\0";
"\1\20\1\21\1\22\1\23\3\13\1\24\2\13\15\17"+
"\1\25\2\13\3\17\1\13\7\26\1\27\5\26\4\30"+
"\5\26\1\31\1\26\15\30\3\26\3\30\10\26\1\27"+
"\5\26\4\32\5\26\1\33\1\26\15\32\3\26\3\32"+
"\1\26\7\34\1\35\5\34\4\36\1\34\1\37\2\26"+
"\1\34\1\40\1\34\15\36\3\34\1\41\2\36\2\34"+
"\1\42\5\34\1\35\5\34\4\43\4\34\1\44\2\34"+
"\15\43\3\34\3\43\10\34\1\35\5\34\4\45\4\34"+
"\1\44\2\34\15\45\3\34\3\45\10\34\1\35\5\34"+
"\4\45\4\34\1\46\2\34\15\45\3\34\3\45\10\34"+
"\1\35\1\34\1\47\3\34\4\50\7\34\15\50\3\34"+
"\3\50\10\34\1\51\5\34\4\52\7\34\15\52\1\34"+
"\1\53\1\34\3\52\1\34\1\54\1\55\5\54\1\56"+
"\1\54\1\57\3\54\4\60\4\54\1\61\2\54\15\60"+
"\2\54\1\62\3\60\1\54\55\0\1\63\62\0\1\64"+
"\4\0\4\65\7\0\6\65\1\66\6\65\3\0\3\65"+
"\12\0\1\67\43\0\1\70\1\71\1\72\1\73\2\74"+
"\1\0\1\75\3\0\1\75\1\17\1\20\1\21\1\22"+
"\7\0\15\17\3\0\3\17\3\0\1\76\1\0\1\77"+
"\2\100\1\0\1\101\3\0\1\101\3\20\1\22\7\0"+
"\15\20\3\0\3\20\2\0\1\70\1\102\1\72\1\73"+
"\2\100\1\0\1\101\3\0\1\101\1\21\1\20\1\21"+
"\1\22\7\0\15\21\3\0\3\21\3\0\1\103\1\0"+
"\1\77\2\74\1\0\1\75\3\0\1\75\4\22\7\0"+
"\15\22\3\0\3\22\26\0\1\104\73\0\1\105\16\0"+
"\1\64\4\0\4\65\7\0\15\65\3\0\3\65\16\0"+
"\4\30\7\0\15\30\3\0\3\30\27\0\1\106\42\0"+
"\4\32\7\0\15\32\3\0\3\32\27\0\1\107\42\0"+
"\4\36\7\0\15\36\3\0\3\36\24\0\1\26\45\0"+
"\4\36\7\0\2\36\1\110\12\36\3\0\3\36\2\0"+
"\1\111\67\0\4\43\7\0\15\43\3\0\3\43\26\0"+
"\1\112\43\0\4\45\7\0\15\45\3\0\3\45\26\0"+
"\1\113\37\0\1\114\57\0\4\50\7\0\15\50\3\0"+
"\3\50\11\0\1\115\4\0\4\65\7\0\15\65\3\0"+
"\3\65\16\0\4\52\7\0\15\52\3\0\3\52\47\0"+
"\1\114\6\0\1\116\63\0\1\117\57\0\4\60\7\0"+
"\15\60\3\0\3\60\26\0\1\120\43\0\4\65\7\0"+
"\15\65\3\0\3\65\14\0\1\34\1\0\4\121\1\0"+
"\3\122\3\0\15\121\3\0\3\121\14\0\1\34\1\0"+
"\4\121\1\0\3\122\3\0\3\121\1\123\11\121\3\0"+
"\3\121\16\0\1\124\1\0\1\124\10\0\15\124\3\0"+
"\3\124\16\0\1\125\1\126\1\127\1\130\7\0\15\125"+
"\3\0\3\125\16\0\1\131\1\0\1\131\10\0\15\131"+
"\3\0\3\131\16\0\1\132\1\133\1\132\1\133\7\0"+
"\15\132\3\0\3\132\16\0\1\134\2\135\1\136\7\0"+
"\15\134\3\0\3\134\16\0\1\75\2\137\10\0\15\75"+
"\3\0\3\75\16\0\1\140\2\141\1\142\7\0\15\140"+
"\3\0\3\140\16\0\4\133\7\0\15\133\3\0\3\133"+
"\16\0\1\143\2\144\1\145\7\0\15\143\3\0\3\143"+
"\16\0\1\146\2\147\1\150\7\0\15\146\3\0\3\146"+
"\16\0\1\151\1\141\1\152\1\142\7\0\15\151\3\0"+
"\3\151\16\0\1\153\2\126\1\130\7\0\15\153\3\0"+
"\3\153\30\0\1\154\1\155\64\0\1\156\27\0\4\36"+
"\7\0\2\36\1\157\12\36\3\0\3\36\2\0\1\160"+
"\101\0\1\161\1\162\40\0\4\65\7\0\6\65\1\163"+
"\6\65\3\0\3\65\2\0\1\164\63\0\1\165\71\0"+
"\1\166\1\167\34\0\1\170\1\0\1\34\1\0\4\121"+
"\1\0\3\122\3\0\15\121\3\0\3\121\16\0\4\171"+
"\1\0\3\122\3\0\15\171\3\0\3\171\12\0\1\170"+
"\1\0\1\34\1\0\4\121\1\0\3\122\3\0\10\121"+
"\1\172\4\121\3\0\3\121\2\0\1\70\13\0\1\124"+
"\1\0\1\124\10\0\15\124\3\0\3\124\3\0\1\173"+
"\1\0\1\77\2\174\6\0\1\125\1\126\1\127\1\130"+
"\7\0\15\125\3\0\3\125\3\0\1\175\1\0\1\77"+
"\2\176\1\0\1\177\3\0\1\177\3\126\1\130\7\0"+
"\15\126\3\0\3\126\3\0\1\200\1\0\1\77\2\176"+
"\1\0\1\177\3\0\1\177\1\127\1\126\1\127\1\130"+
"\7\0\15\127\3\0\3\127\3\0\1\201\1\0\1\77"+
"\2\174\6\0\4\130\7\0\15\130\3\0\3\130\3\0"+
"\1\202\2\0\1\202\7\0\1\132\1\133\1\132\1\133"+
"\7\0\15\132\3\0\3\132\3\0\1\202\2\0\1\202"+
"\7\0\4\133\7\0\15\133\3\0\3\133\3\0\1\174"+
"\1\0\1\77\2\174\6\0\1\134\2\135\1\136\7\0"+
"\15\134\3\0\3\134\3\0\1\176\1\0\1\77\2\176"+
"\1\0\1\177\3\0\1\177\3\135\1\136\7\0\15\135"+
"\3\0\3\135\3\0\1\174\1\0\1\77\2\174\6\0"+
"\4\136\7\0\15\136\3\0\3\136\3\0\1\177\2\0"+
"\2\177\1\0\1\177\3\0\1\177\3\137\10\0\15\137"+
"\3\0\3\137\3\0\1\103\1\0\1\77\2\74\1\0"+
"\1\75\3\0\1\75\1\140\2\141\1\142\7\0\15\140"+
"\3\0\3\140\3\0\1\76\1\0\1\77\2\100\1\0"+
"\1\101\3\0\1\101\3\141\1\142\7\0\15\141\3\0"+
"\3\141\3\0\1\103\1\0\1\77\2\74\1\0\1\75"+
"\3\0\1\75\4\142\7\0\15\142\3\0\3\142\3\0"+
"\1\74\1\0\1\77\2\74\1\0\1\75\3\0\1\75"+
"\1\143\2\144\1\145\7\0\15\143\3\0\3\143\3\0"+
"\1\100\1\0\1\77\2\100\1\0\1\101\3\0\1\101"+
"\3\144\1\145\7\0\15\144\3\0\3\144\3\0\1\74"+
"\1\0\1\77\2\74\1\0\1\75\3\0\1\75\4\145"+
"\7\0\15\145\3\0\3\145\3\0\1\75\2\0\2\75"+
"\1\0\1\75\3\0\1\75\1\146\2\147\1\150\7\0"+
"\15\146\3\0\3\146\3\0\1\101\2\0\2\101\1\0"+
"\1\101\3\0\1\101\3\147\1\150\7\0\15\147\3\0"+
"\3\147\3\0\1\75\2\0\2\75\1\0\1\75\3\0"+
"\1\75\4\150\7\0\15\150\3\0\3\150\3\0\1\203"+
"\1\0\1\77\2\74\1\0\1\75\3\0\1\75\1\151"+
"\1\141\1\152\1\142\7\0\15\151\3\0\3\151\3\0"+
"\1\204\1\0\1\77\2\100\1\0\1\101\3\0\1\101"+
"\1\152\1\141\1\152\1\142\7\0\15\152\3\0\3\152"+
"\3\0\1\201\1\0\1\77\2\174\6\0\1\153\2\126"+
"\1\130\7\0\15\153\3\0\3\153\31\0\1\155\54\0"+
"\1\205\64\0\1\206\26\0\4\36\7\0\15\36\3\0"+
"\1\36\1\207\1\36\31\0\1\162\54\0\1\210\35\0"+
"\1\34\1\0\4\121\1\0\3\122\3\0\3\121\1\211"+
"\11\121\3\0\3\121\2\0\1\212\102\0\1\167\54\0"+
"\1\213\34\0\1\214\52\0\1\170\3\0\4\171\7\0"+
"\15\171\3\0\3\171\12\0\1\170\1\0\1\215\1\0"+
"\4\121\1\0\3\122\3\0\15\121\3\0\3\121\16\0"+
"\1\216\1\130\1\216\1\130\7\0\15\216\3\0\3\216"+
"\16\0\4\136\7\0\15\136\3\0\3\136\16\0\4\142"+
"\7\0\15\142\3\0\3\142\16\0\4\145\7\0\15\145"+
"\3\0\3\145\16\0\4\150\7\0\15\150\3\0\3\150"+
"\16\0\1\217\1\142\1\217\1\142\7\0\15\217\3\0"+
"\3\217\16\0\4\130\7\0\15\130\3\0\3\130\16\0"+
"\4\220\7\0\15\220\3\0\3\220\33\0\1\221\61\0"+
"\1\222\30\0\4\36\6\0\1\223\15\36\3\0\2\36"+
"\1\224\33\0\1\225\32\0\1\170\1\0\1\34\1\0"+
"\4\121\1\0\3\122\3\0\10\121\1\226\4\121\3\0"+
"\3\121\2\0\1\227\104\0\1\230\36\0\4\231\7\0"+
"\15\231\3\0\3\231\3\0\1\173\1\0\1\77\2\174"+
"\6\0\1\216\1\130\1\216\1\130\7\0\15\216\3\0"+
"\3\216\3\0\1\203\1\0\1\77\2\74\1\0\1\75"+
"\3\0\1\75\1\217\1\142\1\217\1\142\7\0\15\217"+
"\3\0\3\217\3\0\1\202\2\0\1\202\7\0\4\220"+
"\7\0\15\220\3\0\3\220\34\0\1\232\55\0\1\233"+
"\26\0\1\234\60\0\4\36\6\0\1\223\15\36\3\0"+
"\3\36\34\0\1\235\31\0\1\170\1\0\1\114\1\0"+
"\4\121\1\0\3\122\3\0\15\121\3\0\3\121\34\0"+
"\1\236\32\0\1\237\2\0\4\231\7\0\15\231\3\0"+
"\3\231\35\0\1\240\62\0\1\241\20\0\1\242\77\0"+
"\1\243\53\0\1\244\32\0\1\34\1\0\4\171\1\0"+
"\3\122\3\0\15\171\3\0\3\171\36\0\1\245\53\0"+
"\1\246\33\0\4\247\7\0\15\247\3\0\3\247\36\0"+
"\1\250\53\0\1\251\54\0\1\252\61\0\1\253\11\0"+
"\1\254\12\0\4\247\7\0\15\247\3\0\3\247\37\0"+
"\1\255\53\0\1\256\54\0\1\257\22\0\1\13\62\0"+
"\4\260\7\0\15\260\3\0\3\260\40\0\1\261\53\0"+
"\1\262\43\0\1\263\26\0\2\260\1\0\2\260\1\0"+
"\2\260\2\0\5\260\7\0\15\260\3\0\4\260\27\0"+
"\1\264\53\0\1\265\24\0";
private static int [] zzUnpackTrans() {
int [] result = new int[7040];
int [] result = new int[6908];
int offset = 0;
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
return result;
@ -355,8 +351,8 @@ class WikipediaTokenizerImpl {
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
private static final String ZZ_ATTRIBUTE_PACKED_0 =
"\12\0\1\11\7\1\1\11\3\1\1\11\6\1\1\11"+
"\2\1\1\11\14\1\1\11\6\1\2\11\3\0\1\11"+
"\12\0\1\11\7\1\1\11\2\1\1\11\5\1\1\11"+
"\3\1\1\11\13\1\1\11\5\1\2\11\3\0\1\11"+
"\14\0\2\1\2\11\1\1\1\0\2\1\1\11\1\0"+
"\1\1\1\0\1\1\3\0\7\1\2\0\1\1\1\0"+
"\15\1\3\0\1\1\1\11\3\0\1\1\1\11\5\0"+
@ -365,7 +361,7 @@ class WikipediaTokenizerImpl {
"\2\0\3\11";
private static int [] zzUnpackAttribute() {
int [] result = new int[184];
int [] result = new int[181];
int offset = 0;
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
return result;
@ -508,7 +504,6 @@ final void reset() {
/**
* Creates a new scanner
* There is also a java.io.InputStream version of this constructor.
*
* @param in the java.io.Reader to read input from.
*/
@ -517,7 +512,6 @@ final void reset() {
}
/**
* Unpacks the compressed character translation table.
*

View File

@ -212,7 +212,7 @@ DOUBLE_EQUALS = "="{2}
{DOUBLE_BRACE} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;}
{CITATION} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;}
//ignore
. | {WHITESPACE} |{INFOBOX} {numWikiTokensSeen = 0; positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
[^] |{INFOBOX} {numWikiTokensSeen = 0; positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
}
<INTERNAL_LINK_STATE>{
@ -221,7 +221,7 @@ DOUBLE_EQUALS = "="{2}
{ALPHANUM} {yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;}
{DOUBLE_BRACKET_CLOSE} {numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
//ignore
. | {WHITESPACE} { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
[^] { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
}
<EXTERNAL_LINK_STATE>{
@ -236,7 +236,7 @@ DOUBLE_EQUALS = "="{2}
{ALPHANUM} {yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;}
{DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;}
//ignore
. | {WHITESPACE} { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
[^] { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
}
//italics
<TWO_SINGLE_QUOTES_STATE>{
@ -249,7 +249,7 @@ DOUBLE_EQUALS = "="{2}
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
//ignore
. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
[^] { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
}
//bold
<THREE_SINGLE_QUOTES_STATE>{
@ -260,7 +260,7 @@ DOUBLE_EQUALS = "="{2}
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
//ignore
. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
[^] { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
}
//bold italics
@ -272,7 +272,7 @@ DOUBLE_EQUALS = "="{2}
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
//ignore
. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
[^] { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
}
<DOUBLE_EQUALS_STATE>{
@ -280,15 +280,15 @@ DOUBLE_EQUALS = "="{2}
{ALPHANUM} {currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;}
{DOUBLE_EQUALS} {yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
//ignore
. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
[^] { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
}
<DOUBLE_BRACE_STATE>{
{ALPHANUM} {yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;}
{DOUBLE_BRACE_CLOSE} {yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
{CITATION_CLOSE} {yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
//ignore
. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
//ignore
[^] { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
}
<STRING> {
@ -305,7 +305,7 @@ DOUBLE_EQUALS = "="{2}
{PIPE} {yybegin(STRING); return currentTokType;/*pipe*/}
.|{WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */ }
[^] { /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */ }
}
@ -327,7 +327,7 @@ DOUBLE_EQUALS = "="{2}
//end wikipedia
/** Ignore the rest */
. | {WHITESPACE}|{TAGS} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
[^] | {TAGS} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
//INTERNAL_LINK = "["{2}({ALPHANUM}+{WHITESPACE}*)+"]"{2}

View File

@ -202,7 +202,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
}
public void testUnicodeWordBreaks() throws Exception {
WordBreakTestUnicode_6_1_0 wordBreakTest = new WordBreakTestUnicode_6_1_0();
WordBreakTestUnicode_6_3_0 wordBreakTest = new WordBreakTestUnicode_6_3_0();
wordBreakTest.test(a);
}
@ -231,6 +231,8 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
checkOneTerm(a, "아゙", "아゙"); // hangul
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new StandardAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);

View File

@ -424,7 +424,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
}
public void testUnicodeWordBreaks() throws Exception {
WordBreakTestUnicode_6_1_0 wordBreakTest = new WordBreakTestUnicode_6_1_0();
WordBreakTestUnicode_6_3_0 wordBreakTest = new WordBreakTestUnicode_6_3_0();
wordBreakTest.test(a);
}

View File

@ -78,13 +78,13 @@ LTLNFsgB@[191.56.104.113]
iT0LOq.jtPW=G06~cETxl2ge@Ah0.4hn72v.tQ.LU
VGLn@z3E2.3an2.MM
TWmfsxn@[112.192.017.029]
2tP07A@2twe6u0d6uw6o.sed7n.109mx.XN--KGBECHTV
2tP07A@2twe6u0d6uw6o.sed7n.109mx.XN--KPRW13D
CjaPC63@['\RDrwk]
Ayydpdoa@tdgypppmen.wf
"gfKP9"@jo3-r0.mz
aTMgDW4@t5gax.XN--0ZWM56D
aTMgDW4@t5gax.XN--3E0B707E
mcDrMO3FQ@nwc21.y5qd45lesryrp.IL
NZqj@v50egeveepk.z290kk.Bc3.xn--jxalpdlp
NZqj@v50egeveepk.z290kk.Bc3.xn--kprw13d
XtAhFnq@[218.214.251.103]
x0S8uos@[109.82.126.233]
ALB4KFavj16pODdd@i206d6s.MM

View File

@ -78,9 +78,10 @@ import org.junit.Ignore;
* \\p{Script = Hiragana}
* \\p{LineBreak = Complex_Context} (From $line_break_url)
* \\p{WordBreak = ALetter} (From $word_break_url)
* \\p{WordBreak = Hebrew_Letter}
* \\p{WordBreak = Katakana}
* \\p{WordBreak = Numeric} (Excludes full-width Arabic digits)
* [\\uFF10-\\uFF19] (Full-width Arabic digits)
* [\\uFF10-\\uFF19] (Full-width Arabic digits)
*/
\@Ignore
public class ${class_name} extends BaseTokenStreamTestCase {
@ -97,7 +98,7 @@ parse_Unicode_data_file($line_break_url, $codepoints, {'sa' => 1});
parse_Unicode_data_file($scripts_url, $codepoints,
{'han' => 1, 'hiragana' => 1});
parse_Unicode_data_file($word_break_url, $codepoints,
{'aletter' => 1, 'katakana' => 1, 'numeric' => 1});
{'aletter' => 1, 'hebrew_letter' => 1, 'katakana' => 1, 'numeric' => 1});
my @tests = split /\r?\n/, get_URL_content($word_break_test_url);
my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
@ -109,25 +110,33 @@ print STDERR "Writing '$output_path'...";
print OUT $header;
for my $line (@tests) {
next if ($line =~ /^\s*\#/);
# ÷ 0001 × 0300 ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING GRAVE ACCENT (Extend_FE) ÷ [0.3]
next if ($line =~ /^\s*(?:|\#.*)$/); # Skip blank or comment-only lines
# Example line: ÷ 0001 × 0300 ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING GRAVE ACCENT (Extend_FE) ÷ [0.3]
my ($sequence) = $line =~ /^(.*?)\s*\#/;
$line =~ s/\t/ /g; # Convert tabs to two spaces (no tabs allowed in Lucene source)
print OUT " // $line\n";
$sequence =~ s/\s*÷\s*$//; # Trim trailing break character
my $test_string = $sequence;
$test_string =~ s/\s*÷\s*/\\u/g;
$test_string =~ s/\s*×\s*/\\u/g;
$test_string =~ s/\\u([0-9A-F]{5,})/join('', map { "\\u$_" } above_BMP_char_to_surrogates($1))/ge;
$test_string =~ s/\\u000A/\\n/g;
$test_string =~ s/\\u000D/\\r/g;
$test_string =~ s/\\u0022/\\\"/g;
$sequence =~ s/^\s*÷\s*//; # Trim leading break character
my @tokens = ();
for my $candidate (split /\s*÷\s*/, $sequence) {
my @chars = ();
my $has_wanted_char = 0;
while ($candidate =~ /([0-9A-F]+)/gi) {
push @chars, $1;
my $hexchar = $1;
if (4 == length($hexchar)) {
push @chars, $hexchar;
} else {
push @chars, above_BMP_char_to_surrogates($hexchar);
}
unless ($has_wanted_char) {
$has_wanted_char = 1 if (defined($codepoints->[hex($1)]));
$has_wanted_char = 1 if (defined($codepoints->[hex($hexchar)]));
}
}
if ($has_wanted_char) {
@ -144,6 +153,21 @@ close OUT;
print STDERR "done.\n";
# sub above_BMP_char_to_surrogates
#
# Takes a hex string naming a code point above the BMP (i.e., greater than
# 0xFFFF) and returns the corresponding UTF-16 surrogate pair as a
# two-element list (high surrogate, low surrogate), each formatted as
# four uppercase hex digits.
#
# Assumption: the input string is a sequence of more than four hex digits
#
sub above_BMP_char_to_surrogates {
  my ($hex_codepoint) = @_;
  my $codepoint = hex($hex_codepoint);
  # The high surrogate carries the top ten bits of the offset from 0x10000.
  my $offset = $codepoint - 0x10000;
  my @surrogates = (
    0xD800 + ($offset >> 10),
    # 0x10000 has no bits set in its low ten bits, so masking the
    # code point directly yields the low ten bits of the offset.
    0xDC00 + ($codepoint & 0x3FF),
  );
  return map { sprintf("%04X", $_) } @surrogates;
}
# sub parse_Unicode_data_file
#
# Downloads the specified Unicode data file, parses it, and

View File

@ -121,14 +121,14 @@ Bzzzzzzzz! Bzzzzzzzzzzzzzzz! Tell them "0\!P?".shQVdSerA@2qmqj8ul.hm the leg
of LTLNFsgB@[191.56.104.113] all, until it has read it is
iT0LOq.jtPW=G06~cETxl2ge@Ah0.4hn72v.tQ.LU there. <VGLn@z3E2.3an2.MM> Once
TWmfsxn@[112.192.017.029] Spiros under the place
2tP07A@2twe6u0d6uw6o.sed7n.109mx.XN--KGBECHTV as were not a house of the
2tP07A@2twe6u0d6uw6o.sed7n.109mx.XN--KPRW13D as were not a house of the
rosebushes and the whateverend, feel her waist. She changes everything. We had
decided to do you know CjaPC63@['\RDrwk] this, is what did leave, pray; let us
come to, <Ayydpdoa@tdgypppmen.wf> what history as died. Strange, Spiros with
delight: That night "gfKP9"@jo3-r0.mz and gold case
<aTMgDW4@t5gax.XN--0ZWM56D> is spring: the aeon arising, wherein he returned,
<aTMgDW4@t5gax.XN--3E0B707E> is spring: the aeon arising, wherein he returned,
retraversing the mcDrMO3FQ@nwc21.y5qd45lesryrp.IL gates, first
<NZqj@v50egeveepk.z290kk.Bc3.xn--jxalpdlp> to reach session. Initiating first
<NZqj@v50egeveepk.z290kk.Bc3.xn--kprw13d> to reach session. Initiating first
part of the main hall toward his own spurs. Hes an <XtAhFnq@[218.214.251.103]>
Irifix And older ones who wins? ADAM: x0S8uos@[109.82.126.233] The violin and
reality. The hidden set up to come. ROSE WAKINS: No answer. The

View File

@ -24,7 +24,7 @@ and Joe recited this iron bars with their account, poor elth, and she had been
almost drove me towards evening. At
HTTP://173.202.175.16/Md7tF6lj7r/oioJ9TpL8/x%03PjXgMMBC7C3%BDWzoVMzH the
sergeant and then on the raw
<Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m> afternoon towards
<Https://yu7v33rbt.vC6U3.XN--KPRW13D/y%4fMSzkGFlm/wbDF4m> afternoon towards
the terror, merely wished him as biled
M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb -- a conciliatory air on in
<ftp://evzed8zvv.l2xkky.Dq85qcl1.eu:1184/07eY0/3X1OB7gPUk/J8la5OPUY3/y1oTItIs1HFPPp/5Q02N0cPyDH87hSy/jheYGF8s%F3P/%86PmYhi/ViKHoxsHqM8J>
@ -47,7 +47,7 @@ to live. You didn't know nothing could attend more.' He had been a coming! Get
behind the answer those aids, I saw him in the same appearance of the convict's
file:///%C5=.%8by/uuFXEaW8.%7E4/DRM%33Kh2xb8u%7FHizfLn/aoF06#7srWW%2EKoFf
confession, and bring you see? '
HTTP://yA2O3F.XN--0ZWM56D/qPDTt/MwMXGQq2S7JT/TJ2iCND said my limbs. Joe in an
HTTP://yA2O3F.XN--3E0B707E/qPDTt/MwMXGQq2S7JT/TJ2iCND said my limbs. Joe in an
accusatory manner as well known that Joe Gargery marry her cup. `I wonder and
there was publicly made it was,
<file:///Gdx5CDZYW%6cnzMJ/7HJ/J%63BSZDXtS/yfWXqq6#> as lookers on; me, I
@ -63,7 +63,7 @@ again
FTP://Hi144dz6hctql2n3uom.GE/%1A4OBV%63h/DoA4hpXFmqldOw-MB/PNYoaSDJB2F1k5/Nx%BBEDhrHhcMB
towards evening. At last, and kneaded, and a dead man taking any. There was
publicly made out there?' said I,
ftp://w0yaysrl.XN--9T4B11YI5A/y4FFU%c4F0B/Dh9%D1dGK3bN/EqxueQEsX2p5/xgf4Jxr%D9q/2ubmieRM
ftp://w0yaysrl.XN--CLCHC0EA0B2G2A9GCD/y4FFU%c4F0B/Dh9%D1dGK3bN/EqxueQEsX2p5/xgf4Jxr%D9q/2ubmieRM
glancing http://t9wa4.rjcahbc06qmyk9jkhu3f.ZA/vIwW3sc3Pg/Bwmeo6KAjkRY at the
N54l6e.vu/1m2%8bMFjv/oBdy%36.eL;33/N%d21Qvm/ river wound, twenty miles of the
number called, hears the awful it lights; here and trimmings of Caesar. This
@ -155,7 +155,7 @@ ftp://E1cdf-p.XN--MGBERP4A5D4AR:60510/qMaw4kSSgYM/7jgIuL/gSVW6O91/2bhnsj/kl7R5sg
at me, and that her walking z3ymb.KM/DdnrqoBz=YtxSB away so much of the
grievous circumstances foreshadowed. After receiving the way, that I thought,
if she should go to?' `Good again!' cried the
FTP://7kgip3z.XN--HGBK6AJ7F53BBA:15983/OYEQzIA0 society of a savoury pork pie,
FTP://7kgip3z.XN--KPRY57D:15983/OYEQzIA0 society of a savoury pork pie,
and nezt6awdc.lSZDSU14B1OH.4n6nkmjyyj.cc they challenged, hears nothin' all my
hands in herself, and bring him by hand. `This,' ftp://085.062.055.011/bopfVV/
said he wore ftp://Mbbn8n.6ge03fiivyc7of.PS/mvb/X8VNt/5WrMZpw/flC6Rs a dog of
@ -191,7 +191,7 @@ and tingling, and that I had won of the shoulder. `Excuse me, and we departed
from Richard the furthest end of
http://ch43n.51rkj.rze.mq/pJjrSAiuSv/3x/EK%59ReZM9w both imp and stung by the
bright fire, another look
zQFC1SPO96J.Jy20d8.xn--0zwm56d:863/0OWpT4dpkMURAGe/nFg/LQBUr%3E/af7dO1 over her
zQFC1SPO96J.Jy20d8.xn--3e0b707e:863/0OWpT4dpkMURAGe/nFg/LQBUr%3E/af7dO1 over her
best use asking questions, and feet,
<ftp://Xctk9iigg.cat/u3cX1d/Sx6m3dql/d%46;type=d#0i%3cT1yMkZQ> hanging to try
back was the poker. `It was not warmly. `Seems
@ -204,7 +204,7 @@ kitchen wall,
Ftp://2gifamku.jqv10es.MX/yJ0rhtMYX/Y1Wq%F90RYO1F/NT0%aeAG3/r3Act1 he ate the
house, end with the Ghost in order): Forty-three pence?' To five hundred
Gargerys.' `I say, Pip; stay
7WO6F.XN--11B5BS3A9AJ6G/1L%f9G0NEu/L2lD/mQGNS9UhgCEb out with
7WO6F.XN--45BRJ9C/1L%f9G0NEu/L2lD/mQGNS9UhgCEb out with
ftp://mIMU.t4d24n4lyx39.zURN708MCNGK-TJ42GLLBQRJHVENGPO.bw:59930/KmBYQKHfcjNRe/rK3fUjg%0Ad/.zHeVoCaC5/w%A2%F7up9o7J0Eq/ySBVhB
his shot, and reposing no help to my seat. It was in the kitchen wall, because
I calculated the sounds by giving me by the name for a rush of Joe's forge
@ -299,7 +299,7 @@ She drew the kitchen, carrying file:///Y?GG/BBqMPBJ/nsxX3qP/8P24WdqBxH so low
wooden hut
ftp://7vl2w.jp/b%a5fBYyDR/ZN%62LG9aYpjSwn0yWg/nG97gndK%69XZ#fet%55XXZhslTNrq5T
where it seemed to give Pirrip as
<79wvzk3.24dyfkxg0f4z-hsqgqqzj2p9n59el0a.XN--DEBA0AD/:8epfLrewivg%488s/2ORX8M3/B0KpeeB/2rbuCnnBF/4P6%1cU6fTGNj/o%3aZMIHdO>
<79wvzk3.24dyfkxg0f4z-hsqgqqzj2p9n59el0a.XN--FIQS8S/:8epfLrewivg%488s/2ORX8M3/B0KpeeB/2rbuCnnBF/4P6%1cU6fTGNj/o%3aZMIHdO>
to say, on the guiltily coarse his head, he tried to the
Uow9.sF.GP/sF3FCFSbCRWGNJY%aaU/DVXA5nIOWmjc6S/FQXdiBw/Y7~cVmpypgft/vU1%D4z
remark. `There's one sprinkled all I was possible she beggared me. All these
@ -311,7 +311,7 @@ Http://Ed095eimjy.rlb5698d.kp/_l5uoOO/aA494s?3nSxdIpE=y%79qu+2un1hGR&J%76=8&L%be
he shook her veil so thick nor my milk and would impart all had returned, with
soap-suds, I had FILE:///#F9Bgl just like thin snow. `Enough of his right side
of thenceforth sitting
jyia054.l814D9SNHRRA5RJCCW.kvxga.XN--0ZWM56D/sBbx24%f2Tw2/Sd0Lul0Vg1bbIqW~/lveEw
jyia054.l814D9SNHRRA5RJCCW.kvxga.XN--3E0B707E/sBbx24%f2Tw2/Sd0Lul0Vg1bbIqW~/lveEw
in File:///KKfIe63z/BETB.T%C6sG/RcYgnOycg my soul. I sat down on it, I have
been a spoon that the pie, blacksmith?' asked Estella of it made a mouth wide
open, and so
@ -324,7 +324,7 @@ FTP://7qf.hlj.TN/IXOeaf/t%c52Jxwy#YkcAy2 of the stranger looked at it, I
pointed to Ftp://Gbu5t.HT/xad4fgjaN#GLpU3XQd6%7F(cHIz himself. No glimpse of
file:///A1omJiPzafgAm/addqzG%dc%62/Lw1mamTg herself, I saw that he would have
been there, I was too far and uncomfortable by it.
http://89qw34ksf0qf6iq264of-1nya4ds7qvpixw8c951aw8wcm3.qxk7usa.N8j1frzfgnkbi9y2.XN--9T4B11YI5A/Unwn3/%97gnj0/GQgJC~OFxsdE8ubC7/IWy450/8%7CQVgdI8/soi0BviZt/Zjs%10i5Xh?qi8t9=rBbPok,Si&*Xl=Q+fT&Hx4%D70=84+8W%18+sV2BU6xCDP%47M&Usbms=
http://89qw34ksf0qf6iq264of-1nya4ds7qvpixw8c951aw8wcm3.qxk7usa.N8j1frzfgnkbi9y2.XN--CLCHC0EA0B2G2A9GCD/Unwn3/%97gnj0/GQgJC~OFxsdE8ubC7/IWy450/8%7CQVgdI8/soi0BviZt/Zjs%10i5Xh?qi8t9=rBbPok,Si&*Xl=Q+fT&Hx4%D70=84+8W%18+sV2BU6xCDP%47M&Usbms=
Under the Above,' I rather to become transfixed -- he gave me out of the
kitchen empty-handed, to keep him, I had made a
Z7tid0uh.eZMOI-M1.umlsyksuzovqdw6wozbd.BW/m%e684OhC/ErAhpGiG subject, if he had
@ -468,7 +468,7 @@ hard twist upon his -- `Well, boy,' Uncle Pumblechook: a look at the sermon he
had heard it had hesitated as little window, violently plunging and she had
committed, and had all about the present calling, which the fingers of tea on
Saturdays than this country, gentlemen, but I could see those,
https://nWC9-RIA00RPVL4SSWRICWWX3NH5SMQIA7IPMCK174T30VQBL-M6.XN--0ZWM56D/CwE%e2rWaYZmE?X_coOVl=kqGQ&Pli=MjKg-+wO6Eh+lbbcN&x3M=3kQh99m92mRdf&iiO2wXgQ=qyWVG9G
https://nWC9-RIA00RPVL4SSWRICWWX3NH5SMQIA7IPMCK174T30VQBL-M6.XN--3E0B707E/CwE%e2rWaYZmE?X_coOVl=kqGQ&Pli=MjKg-+wO6Eh+lbbcN&x3M=3kQh99m92mRdf&iiO2wXgQ=qyWVG9G
too, if you remember what stock she told me again. `But I know what
file:///enqvF%EFLOBsZhl8h2z wittles is?' `Yes, ma'am.' `Estella, take me again
and ftp://133.4.130.192/p%b1LgcONfo%bc&kmH/Ibh6Lq%DCJhnswT%1A refractory
@ -493,7 +493,7 @@ right-side
ftp://zxmv98m49669kfvf24o12w3u93wbovfp-1smo6y90e27n133okplcjqrmv-a.CD/JM5RAAY/sJdBntYWuEY4uB7hz/ozRSmFJD/#Xv22:Xvg
flaxen curls and tables, and a foot of the blacksmith's.' `Halloa!' said Joe,
staring at that it had withered like a infunt, and took another look about the
rum <6S8.Crwllo5e3.jmtz.XN--G6W251D/6InlQn/hnhu2f%ac8tX/apq%0D6o/> out at once.
rum <6S8.Crwllo5e3.jmtz.XN--GECRJ9C/6InlQn/hnhu2f%ac8tX/apq%0D6o/> out at once.
Three Jolly Bargemen to think she seemed to tell you were. When we saw the file
coming at my slice. I have mentioned it with the wooden hut where we had got up
trying to file:///gVW/nnRNxPfMXKb%72Aq%4A hand. If ever grateful for. If a
@ -662,7 +662,7 @@ open,' he
https://227.086.128.010:64985/MDKuFInA86qto5/_cK=4S%49Ic/SPp76/TlV%0Arlwfx/
wiped the liquor. He was the bad; and some one
Ftp://171.160.94.43/ALTgS46I4VM/55PbbK/5N%faTSE another
Ftp://3zd7z.etw.XN--JXALPDLP/4UztCuTbW2z/LL%2cDI/dTYSi9 turned to put straws
Ftp://3zd7z.etw.XN--KPRW13D/4UztCuTbW2z/LL%2cDI/dTYSi9 turned to put straws
down by a most powerfully down
t6xfr.wxjz5p2t5.zl8m4.MN/2cbpjk/gsdm/5Mvc-j3rc/16Wb65&c7x to me, and all that
know the window,
@ -993,7 +993,7 @@ upon a door, which was gobbling mincemeat, meatbone, bread, some lace for it
that Joe's blue file:///EYS2nDf%9671qsm34OZeB%e5lUA/rYBDn0DKs0/ eyes, had an
hour longer than at me, and dismal, and gloves, and that's further than I
mpuwl0.BA/MkvAvc?j%11K4=9gE%613&qOOEP0t=g7EXs looked on. `Now, boy!
g6tylc0.daeczh.4q.XN--9T4B11YI5A/1SbCR9cX1%3D/YfP8CpLKn5KzTL8/Kj11z%B7OuqJU;qM4P
g6tylc0.daeczh.4q.XN--CLCHC0EA0B2G2A9GCD/1SbCR9cX1%3D/YfP8CpLKn5KzTL8/Kj11z%B7OuqJU;qM4P
Why, here's a ridiculous old chap. And looked up by hand. `Why don't like
`sulks.' Therefore, I was in such game?' Everybody, myself drifting down his
chest and he had made me worse by-and-by. I was a
@ -1035,7 +1035,7 @@ in every word out again. `You are prison-ships, and they fought
<HTTPS://bF2RA.kw/1TA9pTTBg/nM/VSRo%85Kt?%62mxNfo=HDowgwkM3&9oPOLH2=yKOxIe+YNtt>
for us heavy. `I Bolted, myself, 5.Piba4ac.JE/55M1H/AZXdj and thread, and we
after him, or to inspire confidence. This was brought you spoke all the act, he
couldn't m-k6-ej7x.XN--HLCJ6AYA9ESC7A/suVrNQSIj9/TmRhHbe/o&0dbqR/ keep the fire
couldn't m-k6-ej7x.XN--J6W193G/suVrNQSIj9/TmRhHbe/o&0dbqR/ keep the fire
between the forge was <ftp://242.228.138.8/o%CC_QjILS%17aYH/%caw8CcVZyPRZ/>
busy in it. Until
hGE9YH3D6.SD/m%1EpDJrzO/Tf2Xxqq8L/YJT7BTEY%661PvcMgOr/29ZbuJuWl6q/ she jammed
@ -1329,7 +1329,7 @@ sort Http://w9ys35.wb55p6l.hxl.rs/Y97%58Lp8JjLZw/5L --
FILE://155.24.106.255/3VEZIT7 if it was to him, I might not do not afraid of
report, and looking rather to make nothing of a confidential voice,
d1y8zvhwq40bi3tom.hPCZ.gJ-286X.TG/ayWKrgAvF6tn/L4SgquZT6C/1DmNe/CI69rJ/%f6QrzZGkSQ
as lda5l5wc.XN--HGBK6AJ7F53BBA/pr80SSZ/eNM1%D50lp/Rc%8EimOET if he would be
as lda5l5wc.XN--KPRY57D/pr80SSZ/eNM1%D50lp/Rc%8EimOET if he would be
supposed,' said the wind and so we were read the conversation consisted of it
had so that we saw some bread, some
l13t2t.sk/O%2BmRkw/@0AgGL@NX/wgt&aggDcp#0IYe'C brandy out: no black velvet

View File

@ -10,7 +10,7 @@ http://Rcbu6/Oxc%C0IkGSZ8rO9IUpd/BEvkvw3nWNXZ/P%17tp3gjATN/0ZRzs
file:///2CdsP/U2GCLT
Http://Pzw978uzb.ai/yB;mt/o8hVKG/%231Y/Xb1%bb6v1fhjfdkfkBvxed?8mq~=OvF&STpJJk=ws0ZO&0DRA=
HTTP://173.202.175.16/Md7tF6lj7r/oioJ9TpL8/x%03PjXgMMBC7C3%BDWzoVMzH
Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m
Https://yu7v33rbt.vC6U3.XN--KPRW13D/y%4fMSzkGFlm/wbDF4m
M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb
ftp://evzed8zvv.l2xkky.Dq85qcl1.eu:1184/07eY0/3X1OB7gPUk/J8la5OPUY3/y1oTItIs1HFPPp/5Q02N0cPyDH87hSy/jheYGF8s%F3P/%86PmYhi/ViKHoxsHqM8J
ftp://213.7.210.47/%e5pFkj6e6Jczc/ypJGG/z%663jYR/37IxLQBPr/Ciq50EUIdueyj
@ -23,13 +23,13 @@ Ftp://Xmswrxn8d-1s.pe.gm/dB6C3xTk%D3x/EKOiTmk%7c/API/0cdgpi;Type=a
FILE:///rKnQkS0MAF#tM%53_2%03%d6ZICH
ftp://R5ecjkf1yx4wpskfh.tv0y3m90ak.0R605.se:51297/zpWcRRcG/1woSqw7ZUko/
file:///%C5=.%8by/uuFXEaW8.%7E4/DRM%33Kh2xb8u%7FHizfLn/aoF06#7srWW%2EKoFf
HTTP://yA2O3F.XN--0ZWM56D/qPDTt/MwMXGQq2S7JT/TJ2iCND
HTTP://yA2O3F.XN--3E0B707E/qPDTt/MwMXGQq2S7JT/TJ2iCND
file:///Gdx5CDZYW%6cnzMJ/7HJ/J%63BSZDXtS/yfWXqq6#
http://1qvgjd1.TP/7oq5gWW/Gwqf8fxBXR4/?Br,q=ayMz0&1IO%370N7=;Sl1czc2L+5bRISfD+w&ygP3FhV%E1w36=2Rx
ftp://5SCC6BUYP.Knf1cvlc22z9.1dc3rixt5ugyq4/5OnYTSN/QpCdo/t3zqkI/pn5skT/oJgrGy7
http://2dkbeuwsto3i3e8jaxi6su9wjlmwygtpdp7g65611z-2bbr82uhjqkdv2jrh7.KZ/FiSvI/aaB&dPQ%42kLdM
FTP://Hi144dz6hctql2n3uom.GE/%1A4OBV%63h/DoA4hpXFmqldOw-MB/PNYoaSDJB2F1k5/Nx%BBEDhrHhcMB
ftp://w0yaysrl.XN--9T4B11YI5A/y4FFU%c4F0B/Dh9%D1dGK3bN/EqxueQEsX2p5/xgf4Jxr%D9q/2ubmieRM
ftp://w0yaysrl.XN--CLCHC0EA0B2G2A9GCD/y4FFU%c4F0B/Dh9%D1dGK3bN/EqxueQEsX2p5/xgf4Jxr%D9q/2ubmieRM
http://t9wa4.rjcahbc06qmyk9jkhu3f.ZA/vIwW3sc3Pg/Bwmeo6KAjkRY
N54l6e.vu/1m2%8bMFjv/oBdy%36.eL;33/N%d21Qvm/
http://ah-2d4.ASIA/qmp
@ -75,7 +75,7 @@ http://4u3o/BKdhwRyzG
file:///LdsHfPABFz1vRD1OB6Yl/RS6&1Gmz/mfYul/
ftp://E1cdf-p.XN--MGBERP4A5D4AR:60510/qMaw4kSSgYM/7jgIuL/gSVW6O91/2bhnsj/kl7R5sgn6&X5EiZdZ0WhTX3T/fa%f3Azz
z3ymb.KM/DdnrqoBz=YtxSB
FTP://7kgip3z.XN--HGBK6AJ7F53BBA:15983/OYEQzIA0
FTP://7kgip3z.XN--KPRY57D:15983/OYEQzIA0
nezt6awdc.lSZDSU14B1OH.4n6nkmjyyj.cc
ftp://085.062.055.011/bopfVV/
ftp://Mbbn8n.6ge03fiivyc7of.PS/mvb/X8VNt/5WrMZpw/flC6Rs
@ -93,12 +93,12 @@ https://[3790:ad57:0B63::e5f7:f6ac:164C]/Obax;zcD/Y%48%9a/Z2xcdar
bl60k0jqkc9.oow84o1.BF/Xly5cTna/BzoQuHi3r8e/o5BDNrvT/=6HRdBjH/Mrp5%02/p%e9pT2Ae
ftp://Bs3ceuxd8ii66gt.X8wwdpt.BB:27095/3BfkvfzcmTS/FTffh&S/gIWvJ5Kd/AlOQ%3EnO
http://ch43n.51rkj.rze.mq/pJjrSAiuSv/3x/EK%59ReZM9w
zQFC1SPO96J.Jy20d8.xn--0zwm56d:863/0OWpT4dpkMURAGe/nFg/LQBUr%3E/af7dO1
zQFC1SPO96J.Jy20d8.xn--3e0b707e:863/0OWpT4dpkMURAGe/nFg/LQBUr%3E/af7dO1
ftp://Xctk9iigg.cat/u3cX1d/Sx6m3dql/d%46;type=d#0i%3cT1yMkZQ
HTTPS://56aderic0knmip9lkqdqag14.uk:45885/lELiK:/vF%4C5Enwqy/P5NGJ2b/dD6sg1yMV
ftp://vlt.3g45k63viz2.tcnm3.UA:60664/AJ9iqYk%c1/uKbohn2/K%D1kequ4z8rxFpJ
Ftp://2gifamku.jqv10es.MX/yJ0rhtMYX/Y1Wq%F90RYO1F/NT0%aeAG3/r3Act1
7WO6F.XN--11B5BS3A9AJ6G/1L%f9G0NEu/L2lD/mQGNS9UhgCEb
7WO6F.XN--45BRJ9C/1L%f9G0NEu/L2lD/mQGNS9UhgCEb
ftp://mIMU.t4d24n4lyx39.zURN708MCNGK-TJ42GLLBQRJHVENGPO.bw:59930/KmBYQKHfcjNRe/rK3fUjg%0Ad/.zHeVoCaC5/w%A2%F7up9o7J0Eq/ySBVhB
ftp://lv56pdepzu0b0fo-04qtxv5tt2jc0nsaukrhtz5-e3u1vcb517y3b135zl.e0r1hson.dk/3TVoqjp6%1FCFSkt/006VZfho/gxrWxgDawM3Uk
Ftp://7n977.Niyt.2fgkzfhj.q7-DJ.Ow7a.it/5zfRi3PO8/1zfKT9%421tP/?SazEijJq%710COQKWeLE/TdUc%b2u/2AxBw9%4BUN6Zp4Z/KfUZd1MTdPv/L4m1tI3/WJvcK1
@ -147,20 +147,20 @@ ftp://Lq.es/%B1ZPdTZgB2mNFW/qre92rM
file:///IZ47ESCtX%aatQab1/V553gjR?Me/#9%68qPw
file:///Y?GG/BBqMPBJ/nsxX3qP/8P24WdqBxH
ftp://7vl2w.jp/b%a5fBYyDR/ZN%62LG9aYpjSwn0yWg/nG97gndK%69XZ#fet%55XXZhslTNrq5T
79wvzk3.24dyfkxg0f4z-hsqgqqzj2p9n59el0a.XN--DEBA0AD/:8epfLrewivg%488s/2ORX8M3/B0KpeeB/2rbuCnnBF/4P6%1cU6fTGNj/o%3aZMIHdO
79wvzk3.24dyfkxg0f4z-hsqgqqzj2p9n59el0a.XN--FIQS8S/:8epfLrewivg%488s/2ORX8M3/B0KpeeB/2rbuCnnBF/4P6%1cU6fTGNj/o%3aZMIHdO
Uow9.sF.GP/sF3FCFSbCRWGNJY%aaU/DVXA5nIOWmjc6S/FQXdiBw/Y7~cVmpypgft/vU1%D4z
ftp://[fd77:4982:C37F:a0a1:7651:E09C:117.093.145.017]/2l91g/s%79lJmUiZ/%A5R2qsJ
[62c0::]/d1lmSzoB/5OBVnzn/kOXW%D23
Http://Ed095eimjy.rlb5698d.kp/_l5uoOO/aA494s?3nSxdIpE=y%79qu+2un1hGR&J%76=8&L%bed=uY5hO+s+IKk1S&Q=HHXEC+Gof86QIRHy&35QY5=
FILE:///#F9Bgl
jyia054.l814D9SNHRRA5RJCCW.kvxga.XN--0ZWM56D/sBbx24%f2Tw2/Sd0Lul0Vg1bbIqW~/lveEw
jyia054.l814D9SNHRRA5RJCCW.kvxga.XN--3E0B707E/sBbx24%f2Tw2/Sd0Lul0Vg1bbIqW~/lveEw
File:///KKfIe63z/BETB.T%C6sG/RcYgnOycg
ftp://892f7.oel50j.32.9qj1p-g7lgw.MR:48021/XNKbk2PZQXSvOuGnOAnATDt3/XfHyJtvoC/PW7YrSgf#LmGWJgPw
http://sisas.ua/4CU60ZLK4VgY8AR89
FTP://7qf.hlj.TN/IXOeaf/t%c52Jxwy#YkcAy2
Ftp://Gbu5t.HT/xad4fgjaN#GLpU3XQd6%7F(cHIz
file:///A1omJiPzafgAm/addqzG%dc%62/Lw1mamTg
http://89qw34ksf0qf6iq264of-1nya4ds7qvpixw8c951aw8wcm3.qxk7usa.N8j1frzfgnkbi9y2.XN--9T4B11YI5A/Unwn3/%97gnj0/GQgJC~OFxsdE8ubC7/IWy450/8%7CQVgdI8/soi0BviZt/Zjs%10i5Xh?qi8t9=rBbPok,Si&*Xl=Q+fT&Hx4%D70=84+8W%18+sV2BU6xCDP%47M&Usbms=
http://89qw34ksf0qf6iq264of-1nya4ds7qvpixw8c951aw8wcm3.qxk7usa.N8j1frzfgnkbi9y2.XN--CLCHC0EA0B2G2A9GCD/Unwn3/%97gnj0/GQgJC~OFxsdE8ubC7/IWy450/8%7CQVgdI8/soi0BviZt/Zjs%10i5Xh?qi8t9=rBbPok,Si&*Xl=Q+fT&Hx4%D70=84+8W%18+sV2BU6xCDP%47M&Usbms=
Z7tid0uh.eZMOI-M1.umlsyksuzovqdw6wozbd.BW/m%e684OhC/ErAhpGiG
ftp://tw7d-6yu.im:2055/%66qbqzss/OmPGW;type=d
FTP://zst.tn/QcUpaA/VKvJ2/JN6AKew/iXYIiHm7mfPFmD%21E5/yTQpoiqdbaaS1/LnzOX#VqsobH
@ -228,7 +228,7 @@ file:///UIIGOxv6jvF2%c0/%A8J3%677Gmq8im1zklKhqx/HMhCSY2QcyxvL/
http://Qhk9z.zm/cOGBen/mBsDycEI5V7L1s%84WUj7863/p%5f~okuRD51b0M?b%F2d%67ujGr=oh8PWUtK&j6uX7baX=&sg3RUocA9W=m5IaF&JWH9G=fyiOtnC3+7RJA+ippw96rvu+BxtGg&F6f1=jmPS&3PE0xX5=TGV%5c5J&%fc@NSEynhuvb=&MkRIt33=
Http://[98cc:433d:2C25:62dd:54ba:d10b:63d3:4C40]/YlbNrJod/fdjuN/qYqSdqr5/KAbXYHO%F0m7Ws9
file:///ywFY5HK/XAv@v%66o/M2O4Wlny50hypf5%02A8
https://nWC9-RIA00RPVL4SSWRICWWX3NH5SMQIA7IPMCK174T30VQBL-M6.XN--0ZWM56D/CwE%e2rWaYZmE?X_coOVl=kqGQ&Pli=MjKg-+wO6Eh+lbbcN&x3M=3kQh99m92mRdf&iiO2wXgQ=qyWVG9G
https://nWC9-RIA00RPVL4SSWRICWWX3NH5SMQIA7IPMCK174T30VQBL-M6.XN--3E0B707E/CwE%e2rWaYZmE?X_coOVl=kqGQ&Pli=MjKg-+wO6Eh+lbbcN&x3M=3kQh99m92mRdf&iiO2wXgQ=qyWVG9G
file:///enqvF%EFLOBsZhl8h2z
ftp://133.4.130.192/p%b1LgcONfo%bc&kmH/Ibh6Lq%DCJhnswT%1A
ftp://1xf.ipl4f0y6c4.VA/LHuq~/p2nPbE/0YGGNJB%DEje2psef_B/aKOuMl1Q9
@ -240,7 +240,7 @@ http://nEN5ZN.EG/%0efsf4v30L
file:///19%9947/ksd3Sq7W78%27/2K_Ylzcu2q
r8sht9qzsc1e2wp.ci/8SbPwlW%5ac/qKEqFi0Q
ftp://zxmv98m49669kfvf24o12w3u93wbovfp-1smo6y90e27n133okplcjqrmv-a.CD/JM5RAAY/sJdBntYWuEY4uB7hz/ozRSmFJD/#Xv22:Xvg
6S8.Crwllo5e3.jmtz.XN--G6W251D/6InlQn/hnhu2f%ac8tX/apq%0D6o/
6S8.Crwllo5e3.jmtz.XN--GECRJ9C/6InlQn/hnhu2f%ac8tX/apq%0D6o/
file:///gVW/nnRNxPfMXKb%72Aq%4A
file:///Fzza388TQ
file:///
@ -314,7 +314,7 @@ file:///3%aexrb7UdZ5GpR4ZIfoxwL/vQV%4a2zQxki/QRji6gHpMGgBaM/d%71A2CTpZv-kF0tD/Ig
f5ms.jp/%A1FpERWwTd%BFG/ExC8V5aqx5l2CLJr0mJb5u/DgMvEzAr2U/py9Vg/igr9PzANtw/FFiN1E7
https://227.086.128.010:64985/MDKuFInA86qto5/_cK=4S%49Ic/SPp76/TlV%0Arlwfx/
Ftp://171.160.94.43/ALTgS46I4VM/55PbbK/5N%faTSE
Ftp://3zd7z.etw.XN--JXALPDLP/4UztCuTbW2z/LL%2cDI/dTYSi9
Ftp://3zd7z.etw.XN--KPRW13D/4UztCuTbW2z/LL%2cDI/dTYSi9
t6xfr.wxjz5p2t5.zl8m4.MN/2cbpjk/gsdm/5Mvc-j3rc/16Wb65&c7x
ftp://D02-auxxaeqnv9ve-jlmo3.l10vqu.12jl.2mvjwrsqm.BA/r71QLLNu6oGJjG/HbxrX1Grq8/QR%2agZv4hR
file:///XoCg%EDVf/A3ibJYjU
@ -476,7 +476,7 @@ ftp://53.151.134.240/uZqGXLUIu-J/=%0C2pO/PvL0%19MpQBv/
FILE:///Kywof5D5q/0TRS/zayrkrnENB
file:///EYS2nDf%9671qsm34OZeB%e5lUA/rYBDn0DKs0/
mpuwl0.BA/MkvAvc?j%11K4=9gE%613&qOOEP0t=g7EXs
g6tylc0.daeczh.4q.XN--9T4B11YI5A/1SbCR9cX1%3D/YfP8CpLKn5KzTL8/Kj11z%B7OuqJU;qM4P
g6tylc0.daeczh.4q.XN--CLCHC0EA0B2G2A9GCD/1SbCR9cX1%3D/YfP8CpLKn5KzTL8/Kj11z%B7OuqJU;qM4P
file:///TJa%86AczeCmM5QMhi/Wox~Ajl/WxUF%5eSA:y%0fD%E21/x%cca%d3Qgx/8iWJ5-h%26/fCK%01nQNrK8#ygTTB
file:///~%303cUUVYTEaQU5%5DXbogiPKb/favR2rETEh/9TXM%15u/nYCOZpZgL
file:///mJM%a1/jv5%53QDqE/bFMu0CBp
@ -496,7 +496,7 @@ http://gpu16lz.LS/9e%daJrwQfHEpFvsZ3jx/c4STIJ/CmvEGAUx9f/
file://ij9anjtok86ro.uN-BGDQ855IB.sDXAQR.5kr8kz.3J3M8XRM.18r3s0g-6.4rjsmwue0lwao0og17d-5-1.F1h3qgkul29yw2t4p4se5clomncxhmoy.g6c9tbz7.pa/5LMtmbl/1tfIF/pBOV7Hc
HTTPS://bF2RA.kw/1TA9pTTBg/nM/VSRo%85Kt?%62mxNfo=HDowgwkM3&9oPOLH2=yKOxIe+YNtt
5.Piba4ac.JE/55M1H/AZXdj
m-k6-ej7x.XN--HLCJ6AYA9ESC7A/suVrNQSIj9/TmRhHbe/o&0dbqR/
m-k6-ej7x.XN--J6W193G/suVrNQSIj9/TmRhHbe/o&0dbqR/
ftp://242.228.138.8/o%CC_QjILS%17aYH/%caw8CcVZyPRZ/
hGE9YH3D6.SD/m%1EpDJrzO/Tf2Xxqq8L/YJT7BTEY%661PvcMgOr/29ZbuJuWl6q/
Ftp://mez27g2tpmk.MC/%B8AHk%95etDns%46/gXbsCn%6C-/s8_Jmy/DhmfT~Di6KD
@ -633,7 +633,7 @@ http://047.014.184.200/Z_QdOwjzfBue4Nt/aEn/xuEQD/cXlnoxHIK%7d8h/1%eegEk7E0/8Ejku
Http://w9ys35.wb55p6l.hxl.rs/Y97%58Lp8JjLZw/5L
FILE://155.24.106.255/3VEZIT7
d1y8zvhwq40bi3tom.hPCZ.gJ-286X.TG/ayWKrgAvF6tn/L4SgquZT6C/1DmNe/CI69rJ/%f6QrzZGkSQ
lda5l5wc.XN--HGBK6AJ7F53BBA/pr80SSZ/eNM1%D50lp/Rc%8EimOET
lda5l5wc.XN--KPRY57D/pr80SSZ/eNM1%D50lp/Rc%8EimOET
l13t2t.sk/O%2BmRkw/@0AgGL@NX/wgt&aggDcp#0IYe'C
FILE://a6ys9a4.xj.BY/%99BGXp/F=yJtxc71/gvXuHuB9k
212.072.006.032/6kV8ce%2e/%e7lzm-HB%4artP/zg6tWMW7RIG?U7=HAXw$D3sM%7DyDJ&Gt=

View File

@ -75,7 +75,7 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
+ " samba Halta gamba "
+ "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R\n"
+ "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb\n"
+ "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m"
+ "Https://yu7v33rbt.vC6U3.XN--KPRW13D/y%4fMSzkGFlm/wbDF4m"
+ " inter Locutio "
+ "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/\n"
+ "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7"
@ -91,7 +91,7 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
"samba", "Halta", "gamba",
"ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R",
"M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb",
"Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m",
"Https://yu7v33rbt.vC6U3.XN--KPRW13D/y%4fMSzkGFlm/wbDF4m",
"inter", "Locutio",
"[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/",
"file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7",

View File

@ -60,20 +60,21 @@ public class GenerateJflexTLDMacros {
/**
 * Apache License 2.0 notice assembled as one string: a block comment
 * ("/*" ... " *&#47;") whose lines are joined with NL.
 */
// NOTE(review): presumably written at the top of the generated TLD macro
// output -- confirm against the code that references this constant.
// NOTE(review): two header variants appear back to back in this span: a
// "Copyright 2001-2005" form terminated by NL + NL, and an ASF
// contributor-agreement form terminated by a single NL. Only one of them
// can terminate the declaration.
private static final String APACHE_LICENSE
= "/*" + NL
+ " * Copyright 2001-2005 The Apache Software Foundation." + NL
+ " *" + NL
+ " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL
+ " * you may not use this file except in compliance with the License." + NL
+ " * You may obtain a copy of the License at" + NL
+ " *" + NL
+ " * http://www.apache.org/licenses/LICENSE-2.0" + NL
+ " *" + NL
+ " * Unless required by applicable law or agreed to in writing, software" + NL
+ " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
+ " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
+ " * See the License for the specific language governing permissions and" + NL
+ " * limitations under the License." + NL
+ " */" + NL + NL;
+ " * Licensed to the Apache Software Foundation (ASF) under one or more" + NL
+ " * contributor license agreements. See the NOTICE file distributed with" + NL
+ " * this work for additional information regarding copyright ownership." + NL
+ " * The ASF licenses this file to You under the Apache License, Version 2.0" + NL
+ " * (the \"License\"); you may not use this file except in compliance with" + NL
+ " * the License. You may obtain a copy of the License at" + NL
+ " *" + NL
+ " * http://www.apache.org/licenses/LICENSE-2.0" + NL
+ " *" + NL
+ " * Unless required by applicable law or agreed to in writing, software" + NL
+ " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
+ " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
+ " * See the License for the specific language governing permissions and" + NL
+ " * limitations under the License." + NL
+ " */" + NL;
private static final Pattern TLD_PATTERN_1
= Pattern.compile("([-A-Za-z0-9]+)\\.\\s+NS\\s+.*");

View File

@ -38,38 +38,43 @@ public class GenerateJFlexSupplementaryMacros {
}
/**
 * Apache License 2.0 notice assembled as one string: a block comment
 * whose lines are joined with NL.
 */
// NOTE(review): presumably emitted by outputHeader() ahead of the generated
// macros -- confirm, since that method's body is not shown here.
// NOTE(review): lines of two header variants are interleaved in this span
// (a "Copyright 2010" form ending in NL + NL and an ASF
// contributor-agreement form ending in a single NL), including two
// "= "/*" + NL" openers and two terminators.
private static final String APACHE_LICENSE
= "/*" + NL
+ " * Copyright 2010 The Apache Software Foundation." + NL
= "/*" + NL
+ " * Licensed to the Apache Software Foundation (ASF) under one or more" + NL
+ " * contributor license agreements. See the NOTICE file distributed with" + NL
+ " * this work for additional information regarding copyright ownership." + NL
+ " * The ASF licenses this file to You under the Apache License, Version 2.0" + NL
+ " * (the \"License\"); you may not use this file except in compliance with" + NL
+ " * the License. You may obtain a copy of the License at" + NL
+ " *" + NL
+ " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL
+ " * you may not use this file except in compliance with the License." + NL
+ " * You may obtain a copy of the License at" + NL
+ " *" + NL
+ " * http://www.apache.org/licenses/LICENSE-2.0" + NL
+ " * http://www.apache.org/licenses/LICENSE-2.0" + NL
+ " *" + NL
+ " * Unless required by applicable law or agreed to in writing, software" + NL
+ " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
+ " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
+ " * See the License for the specific language governing permissions and" + NL
+ " * limitations under the License." + NL
+ " */" + NL + NL;
+ " */" + NL;
/**
 * Command-line entry point. Calls outputHeader() once, then calls
 * outputMacro(name, pattern) once per macro, pairing a macro name ending in
 * "Supp" with a bracketed property expression such as
 * [:WordBreak=ALetter:], [:LineBreak=Complex_Context:] or [:Script=Han:].
 *
 * @param args command-line arguments; not read by this method
 */
// NOTE(review): "ExtendNumLetSupp" is passed to outputMacro twice in a row
// with the same pattern; this looks like an accidental duplicate -- confirm
// how the consumer of the generated macros treats a repeated definition
// before removing one.
// NOTE(review): the call list appears twice in this span: a 13-call list,
// then a 17-call list that swaps the Numeric/Extend order and adds
// SingleQuoteSupp, DoubleQuoteSupp, HebrewLetterSupp and
// RegionalIndicatorSupp.
public static void main(String args[]) {
outputHeader();
outputMacro("ALetterSupp", "[:WordBreak=ALetter:]");
outputMacro("FormatSupp", "[:WordBreak=Format:]");
outputMacro("ExtendSupp", "[:WordBreak=Extend:]");
outputMacro("NumericSupp", "[:WordBreak=Numeric:]");
outputMacro("KatakanaSupp", "[:WordBreak=Katakana:]");
outputMacro("MidLetterSupp", "[:WordBreak=MidLetter:]");
outputMacro("MidNumSupp", "[:WordBreak=MidNum:]");
outputMacro("MidNumLetSupp", "[:WordBreak=MidNumLet:]");
outputMacro("ExtendNumLetSupp", "[:WordBreak=ExtendNumLet:]");
outputMacro("ExtendNumLetSupp", "[:WordBreak=ExtendNumLet:]");
outputMacro("ComplexContextSupp", "[:LineBreak=Complex_Context:]");
outputMacro("HanSupp", "[:Script=Han:]");
outputMacro("HiraganaSupp", "[:Script=Hiragana:]");
outputMacro("ALetterSupp", "[:WordBreak=ALetter:]");
outputMacro("FormatSupp", "[:WordBreak=Format:]");
outputMacro("NumericSupp", "[:WordBreak=Numeric:]");
outputMacro("ExtendSupp", "[:WordBreak=Extend:]");
outputMacro("KatakanaSupp", "[:WordBreak=Katakana:]");
outputMacro("MidLetterSupp", "[:WordBreak=MidLetter:]");
outputMacro("MidNumSupp", "[:WordBreak=MidNum:]");
outputMacro("MidNumLetSupp", "[:WordBreak=MidNumLet:]");
outputMacro("ExtendNumLetSupp", "[:WordBreak=ExtendNumLet:]");
outputMacro("ExtendNumLetSupp", "[:WordBreak=ExtendNumLet:]");
outputMacro("ComplexContextSupp", "[:LineBreak=Complex_Context:]");
outputMacro("HanSupp", "[:Script=Han:]");
outputMacro("HiraganaSupp", "[:Script=Hiragana:]");
outputMacro("SingleQuoteSupp", "[:WordBreak=Single_Quote:]");
outputMacro("DoubleQuoteSupp", "[:WordBreak=Double_Quote:]");
outputMacro("HebrewLetterSupp", "[:WordBreak=Hebrew_Letter:]");
outputMacro("RegionalIndicatorSupp", "[:WordBreak=Regional_Indicator:]");
}
static void outputHeader() {

View File

@ -476,7 +476,7 @@
<available property="jflex.present" classname="jflex.anttask.JFlexTask">
<classpath refid="jflex.classpath"/>
</available>
<fail unless="jflex.present">
<fail unless="jflex.present">&#xA0;
##################################################################
JFlex not found.
JFlex Home: ${jflex.home}
@ -484,14 +484,14 @@
Please install the JFlex 1.5 version (currently not released)
from its SVN repository:
svn co -r 623 http://jflex.svn.sourceforge.net/svnroot/jflex/trunk jflex
svn co -r 722 https://svn.code.sf.net/p/jflex/code/trunk jflex
cd jflex
mvn install
Then, create a build.properties file either in your home
directory, or within the Lucene directory and set the jflex.home
property to the path where the JFlex trunk checkout is located
(in the above example its the directory called "jflex").
(in the above example it's the directory called "jflex").
##################################################################
</fail>