LUCENE-2763: Swap URL+Email recognizing StandardTokenizer and UAX29Tokenizer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1043071 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2010-12-07 14:53:13 +00:00
parent 5b2e0f786b
commit 2b9726ae81
19 changed files with 3560 additions and 3461 deletions

View File

@ -9,14 +9,16 @@ API Changes
* LUCENE-2413: Removed the AnalyzerUtil in common/miscellaneous. (Robert Muir)
* LUCENE-2167: StandardTokenizer/Analyzer in common/standard/ now implement
the Word Break rules from the Unicode Text Segmentation algorithm (UAX#29),
as well as tokenizing URLs and email addresses according to the relevant
RFCs. ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
behavior. (Steven Rowe, Robert Muir, Uwe Schindler)
* LUCENE-2167,LUCENE-2699,LUCENE-2763: StandardTokenizer/Analyzer in
common/standard/ now implement the Word Break rules from the Unicode 6.0.0
Text Segmentation algorithm (UAX#29).
* LUCENE-2699: Update StandardTokenizer and UAX29Tokenizer to Unicode 6.0.0.
(Steven Rowe)
ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
implementation and behavior.
UAX29URLEmailTokenizer tokenizes URLs and E-mail addresses according to the
relevant RFCs, in addition to implementing the UAX#29 Word Break rules.
(Steven Rowe, Robert Muir, Uwe Schindler)
* LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles
can be generated. (Chris Harris via Steven Rowe)

View File

@ -38,7 +38,7 @@
<target name="compile-core" depends="jflex-notice, common.compile-core"/>
<target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-UAX29Tokenizer,jflex-wiki-tokenizer"/>
<target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,jflex-wiki-tokenizer"/>
<target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
@ -62,11 +62,11 @@
nobak="on" />
</target>
<target name="jflex-UAX29Tokenizer" depends="jflex-check" if="jflex.present">
<target name="jflex-UAX29URLEmailTokenizer" depends="jflex-check" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>
</taskdef>
<jflex file="src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex"
<jflex file="src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex"
outdir="src/java/org/apache/lucene/analysis/standard"
nobak="on" />
</target>

View File

@ -15,8 +15,8 @@
*/
// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
// file version from Tuesday, October 12, 2010 11:34:09 AM UTC
// generated on Wednesday, October 13, 2010 4:12:27 AM UTC
// file version from Saturday, December 4, 2010 12:34:19 PM UTC
// generated on Sunday, December 5, 2010 12:24:12 AM UTC
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
ASCIITLD = "." (

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/3/10 9:07 AM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 12/4/10 7:24 PM */
package org.apache.lucene.analysis.standard;
@ -33,8 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
* on 10/3/10 9:07 AM from the specification file
* <tt>C:/Users/rmuir/workspace/lucene-clean/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
* on 12/4/10 7:24 PM from the specification file
* <tt>C:/cygwin/home/us/svn/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
*/
class ClassicTokenizerImpl implements StandardTokenizerInterface {
@ -630,6 +630,12 @@ public final void getText(CharTermAttribute t) {
zzState = ZZ_LEXSTATE[zzLexicalState];
// set up zzAction for empty match case:
int zzAttributes = zzAttrL[zzState];
if ( (zzAttributes & 1) == 1 ) {
zzAction = zzState;
}
zzForAction: {
while (true) {
@ -662,7 +668,7 @@ public final void getText(CharTermAttribute t) {
if (zzNext == -1) break zzForAction;
zzState = zzNext;
int zzAttributes = zzAttrL[zzState];
zzAttributes = zzAttrL[zzState];
if ( (zzAttributes & 1) == 1 ) {
zzAction = zzState;
zzMarkedPosL = zzCurrentPosL;
@ -676,45 +682,45 @@ public final void getText(CharTermAttribute t) {
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 10:
{ return EMAIL;
}
case 11: break;
case 2:
{ return ALPHANUM;
}
case 12: break;
case 4:
{ return HOST;
}
case 13: break;
case 1:
{ /* ignore */
}
case 14: break;
case 8:
{ return ACRONYM_DEP;
}
case 15: break;
case 5:
{ return NUM;
}
case 16: break;
case 11: break;
case 9:
{ return ACRONYM;
}
case 17: break;
case 12: break;
case 7:
{ return COMPANY;
}
case 18: break;
case 13: break;
case 10:
{ return EMAIL;
}
case 14: break;
case 1:
{ /* ignore */
}
case 15: break;
case 6:
{ return APOSTROPHE;
}
case 19: break;
case 16: break;
case 3:
{ return CJ;
}
case 17: break;
case 8:
{ return ACRONYM_DEP;
}
case 18: break;
case 2:
{ return ALPHANUM;
}
case 19: break;
case 4:
{ return HOST;
}
case 20: break;
default:
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {

View File

@ -16,6 +16,6 @@
*/
WARNING: if you change StandardTokenizerImpl*.jflex and need to regenerate
the tokenizer, only use the trunk version of JFlex 1.5 (with a minimum
SVN revision 597) at the moment!
WARNING: if you change StandardTokenizerImpl*.jflex or UAX29URLEmailTokenizer
and need to regenerate the tokenizer, only use the trunk version
of JFlex 1.5 (with a minimum SVN revision 597) at the moment!

View File

@ -83,10 +83,9 @@ public final class StandardTokenizer extends Tokenizer {
@Deprecated
public static final int ACRONYM_DEP = 8;
public static final int URL = 9;
public static final int SOUTHEAST_ASIAN = 10;
public static final int IDEOGRAPHIC = 11;
public static final int HIRAGANA = 12;
public static final int SOUTHEAST_ASIAN = 9;
public static final int IDEOGRAPHIC = 10;
public static final int HIRAGANA = 11;
/** String token types that correspond to token type int constants */
public static final String [] TOKEN_TYPES = new String [] {
@ -99,7 +98,6 @@ public final class StandardTokenizer extends Tokenizer {
"<NUM>",
"<CJ>",
"<ACRONYM_DEP>",
"<URL>",
"<SOUTHEAST_ASIAN>",
"<IDEOGRAPHIC>",
"<HIRAGANA>"

View File

@ -23,14 +23,11 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
* This class implements Word Break rules from the Unicode Text Segmentation
* algorithm, as specified in
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
* URLs and email addresses are also tokenized according to the relevant RFCs.
* <p/>
* Tokens produced are of the following types:
* <ul>
* <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
* <li>&lt;NUM&gt;: A number</li>
* <li>&lt;URL&gt;: A URL</li>
* <li>&lt;EMAIL&gt;: An email address</li>
* <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
* <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
@ -67,83 +64,6 @@ MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
// URL and E-mail syntax specifications:
//
// RFC-952: DOD INTERNET HOST TABLE SPECIFICATION
// RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
// RFC-1123: Requirements for Internet Hosts - Application and Support
// RFC-1738: Uniform Resource Locators (URL)
// RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
// RFC-5234: Augmented BNF for Syntax Specifications: ABNF
// RFC-5321: Simple Mail Transfer Protocol
// RFC-5322: Internet Message Format
%include src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
DomainNameLoose = {DomainLabel} ("." {DomainLabel})*
IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
IPv4Address = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3}
IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
IPv6Address = ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
| "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
| {IPv6Hex16Bit}? "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit} ":" {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::" {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit}
| (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"
URIunreserved = [-._~A-Za-z0-9]
URIpercentEncoded = "%" [0-9A-Fa-f]{2}
URIsubDelims = [!$&'()*+,;=]
URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
URIquery = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIport = ":" [0-9]{1,5}
URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}
URIhostLoose = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose}
URIauthorityStrict = {URIhostStrict} {URIport}?
URIauthorityLoose = {URIlogin}? {URIhostLoose} {URIport}?
HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
HTTPpath = ("/" {HTTPsegment})*
HTTPscheme = [hH][tT][tT][pP][sS]? "://"
HTTPurlFull = {HTTPscheme} {URIauthorityLoose} {HTTPpath}? {URIquery}? {URIfragment}?
// {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
HTTPurlNoScheme = {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}
FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
FTPscheme = [fF][tT][pP] "://"
FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?
FILEscheme = [fF][iI][lL][eE] "://"
FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?
URL = {HTTPurl} | {FTPurl} | {FILEurl}
EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
// DFA minimization allows {IPv6Address} and {IPv4Address} to be included
// in the {EMAILbracketedHost} definition without incurring any size penalties,
// since {EMAILdomainLiteralText} recognizes all valid IP addresses.
// The IP address regexes are included in {EMAILbracketedHost} simply as a
// reminder that they are acceptable bracketed host forms.
EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"
EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
%{
/** Alphanumeric sequences */
public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;
@ -151,12 +71,6 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
/** Numbers */
public static final int NUMERIC_TYPE = StandardTokenizer.NUM;
/** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
public static final int URL_TYPE = StandardTokenizer.URL;
/** E-mail addresses */
public static final int EMAIL_TYPE = StandardTokenizer.EMAIL;
/**
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
* scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
@ -191,9 +105,6 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
//
<<EOF>> { return StandardTokenizerInterface.YYEOF; }
{URL} { return URL_TYPE; }
{EMAIL} { return EMAIL_TYPE; }
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet) × Numeric
// WB12. Numeric × (MidNum | MidNumLet) Numeric

View File

@ -1,847 +0,0 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/13/10 12:12 AM */
package org.apache.lucene.analysis.standard;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* This class implements Word Break rules from the Unicode Text Segmentation
* algorithm, as specified in
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
* <p/>
* Tokens produced are of the following types:
* <ul>
* <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
* <li>&lt;NUM&gt;: A number</li>
* <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
* <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
* <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
* </ul>
* <b>WARNING</b>: Because JFlex does not support Unicode supplementary
* characters (characters above the Basic Multilingual Plane, which contains
* those up to and including U+FFFF), this scanner will not recognize them
* properly. If you need to be able to process text containing supplementary
* characters, consider using the ICU4J-backed implementation in modules/analysis/icu
* (org.apache.lucene.analysis.icu.segmentation.ICUTokenizer)
* instead of this class, since the ICU4J-backed implementation does not have
* this limitation.
*/
public final class UAX29Tokenizer extends Tokenizer {
/** This character denotes the end of file */
private static final int YYEOF = -1;
/** initial size of the lookahead buffer */
private static final int ZZ_BUFFERSIZE = 16384;
/** lexical states */
private static final int YYINITIAL = 0;
/**
* ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
* ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
* at the beginning of a line
* l is of the form l = 2*k, k a non negative integer
*/
private static final int ZZ_LEXSTATE[] = {
0, 0
};
/**
* Translates characters to character classes
*/
private static final String ZZ_CMAP_PACKED =
"\47\0\1\7\4\0\1\6\1\0\1\7\1\0\12\3\1\5\1\6"+
"\5\0\32\1\4\0\1\10\1\0\32\1\57\0\1\1\2\0\1\2"+
"\7\0\1\1\1\0\1\5\2\0\1\1\5\0\27\1\1\0\37\1"+
"\1\0\u01ca\1\4\0\14\1\16\0\5\1\7\0\1\1\1\0\1\1"+
"\21\0\160\2\5\1\1\0\2\1\2\0\4\1\1\6\7\0\1\1"+
"\1\5\3\1\1\0\1\1\1\0\24\1\1\0\123\1\1\0\213\1"+
"\1\0\7\2\236\1\11\0\46\1\2\0\1\1\7\0\47\1\1\0"+
"\1\6\7\0\55\2\1\0\1\2\1\0\2\2\1\0\2\2\1\0"+
"\1\2\10\0\33\1\5\0\4\1\1\5\13\0\4\2\10\0\2\6"+
"\2\0\13\2\5\0\53\1\25\2\12\3\1\0\1\3\1\6\1\0"+
"\2\1\1\2\143\1\1\0\1\1\10\2\1\0\6\2\2\1\2\2"+
"\1\0\4\2\2\1\12\3\3\1\2\0\1\1\17\0\1\2\1\1"+
"\1\2\36\1\33\2\2\0\131\1\13\2\1\1\16\0\12\3\41\1"+
"\11\2\2\1\2\0\1\6\1\0\1\1\5\0\26\1\4\2\1\1"+
"\11\2\1\1\3\2\1\1\5\2\22\0\31\1\3\2\244\0\4\2"+
"\66\1\3\2\1\1\22\2\1\1\7\2\12\1\2\2\2\0\12\3"+
"\1\0\7\1\1\0\7\1\1\0\3\2\1\0\10\1\2\0\2\1"+
"\2\0\26\1\1\0\7\1\1\0\1\1\3\0\4\1\2\0\1\2"+
"\1\1\7\2\2\0\2\2\2\0\3\2\1\1\10\0\1\2\4\0"+
"\2\1\1\0\3\1\2\2\2\0\12\3\2\1\17\0\3\2\1\0"+
"\6\1\4\0\2\1\2\0\26\1\1\0\7\1\1\0\2\1\1\0"+
"\2\1\1\0\2\1\2\0\1\2\1\0\5\2\4\0\2\2\2\0"+
"\3\2\3\0\1\2\7\0\4\1\1\0\1\1\7\0\12\3\2\2"+
"\3\1\1\2\13\0\3\2\1\0\11\1\1\0\3\1\1\0\26\1"+
"\1\0\7\1\1\0\2\1\1\0\5\1\2\0\1\2\1\1\10\2"+
"\1\0\3\2\1\0\3\2\2\0\1\1\17\0\2\1\2\2\2\0"+
"\12\3\21\0\3\2\1\0\10\1\2\0\2\1\2\0\26\1\1\0"+
"\7\1\1\0\2\1\1\0\5\1\2\0\1\2\1\1\7\2\2\0"+
"\2\2\2\0\3\2\10\0\2\2\4\0\2\1\1\0\3\1\2\2"+
"\2\0\12\3\1\0\1\1\20\0\1\2\1\1\1\0\6\1\3\0"+
"\3\1\1\0\4\1\3\0\2\1\1\0\1\1\1\0\2\1\3\0"+
"\2\1\3\0\3\1\3\0\14\1\4\0\5\2\3\0\3\2\1\0"+
"\4\2\2\0\1\1\6\0\1\2\16\0\12\3\21\0\3\2\1\0"+
"\10\1\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1\3\0"+
"\1\1\7\2\1\0\3\2\1\0\4\2\7\0\2\2\1\0\2\1"+
"\6\0\2\1\2\2\2\0\12\3\22\0\2\2\1\0\10\1\1\0"+
"\3\1\1\0\27\1\1\0\12\1\1\0\5\1\2\0\1\2\1\1"+
"\7\2\1\0\3\2\1\0\4\2\7\0\2\2\7\0\1\1\1\0"+
"\2\1\2\2\2\0\12\3\1\0\2\1\17\0\2\2\1\0\10\1"+
"\1\0\3\1\1\0\51\1\2\0\1\1\7\2\1\0\3\2\1\0"+
"\4\2\1\1\10\0\1\2\10\0\2\1\2\2\2\0\12\3\12\0"+
"\6\1\2\0\2\2\1\0\22\1\3\0\30\1\1\0\11\1\1\0"+
"\1\1\2\0\7\1\3\0\1\2\4\0\6\2\1\0\1\2\1\0"+
"\10\2\22\0\2\2\15\0\60\11\1\12\2\11\7\12\5\0\7\11"+
"\10\12\1\0\12\3\47\0\2\11\1\0\1\11\2\0\2\11\1\0"+
"\1\11\2\0\1\11\6\0\4\11\1\0\7\11\1\0\3\11\1\0"+
"\1\11\1\0\1\11\2\0\2\11\1\0\4\11\1\12\2\11\6\12"+
"\1\0\2\12\1\11\2\0\5\11\1\0\1\11\1\0\6\12\2\0"+
"\12\3\2\0\2\11\42\0\1\1\27\0\2\2\6\0\12\3\13\0"+
"\1\2\1\0\1\2\1\0\1\2\4\0\2\2\10\1\1\0\44\1"+
"\4\0\24\2\1\0\2\2\5\1\13\2\1\0\44\2\11\0\1\2"+
"\71\0\53\11\24\12\1\11\12\3\6\0\6\11\4\12\4\11\3\12"+
"\1\11\3\12\2\11\7\12\3\11\4\12\15\11\14\12\1\11\1\12"+
"\12\3\4\12\2\11\46\1\12\0\53\1\1\0\1\1\3\0\u0149\1"+
"\1\0\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0\51\1"+
"\1\0\4\1\2\0\41\1\1\0\4\1\2\0\7\1\1\0\1\1"+
"\1\0\4\1\2\0\17\1\1\0\71\1\1\0\4\1\2\0\103\1"+
"\2\0\3\2\40\0\20\1\20\0\125\1\14\0\u026c\1\2\0\21\1"+
"\1\0\32\1\5\0\113\1\3\0\3\1\17\0\15\1\1\0\4\1"+
"\3\2\13\0\22\1\3\2\13\0\22\1\2\2\14\0\15\1\1\0"+
"\3\1\1\0\2\2\14\0\64\11\40\12\3\0\1\11\4\0\1\11"+
"\1\12\2\0\12\3\41\0\3\2\2\0\12\3\6\0\130\1\10\0"+
"\51\1\1\2\1\1\5\0\106\1\12\0\35\1\3\0\14\2\4\0"+
"\14\2\12\0\12\3\36\11\2\0\5\11\13\0\54\11\4\0\21\12"+
"\7\11\2\12\6\0\12\3\1\11\3\0\2\11\40\0\27\1\5\2"+
"\4\0\65\11\12\12\1\0\35\12\2\0\1\2\12\3\6\0\12\3"+
"\6\0\16\11\122\0\5\2\57\1\21\2\7\1\4\0\12\3\21\0"+
"\11\2\14\0\3\2\36\1\12\2\3\0\2\1\12\3\6\0\46\1"+
"\16\2\14\0\44\1\24\2\10\0\12\3\3\0\3\1\12\3\44\1"+
"\122\0\3\2\1\0\25\2\4\1\1\2\4\1\1\2\15\0\300\1"+
"\47\2\25\0\4\2\u0116\1\2\0\6\1\2\0\46\1\2\0\6\1"+
"\2\0\10\1\1\0\1\1\1\0\1\1\1\0\1\1\1\0\37\1"+
"\2\0\65\1\1\0\7\1\1\0\1\1\3\0\3\1\1\0\7\1"+
"\3\0\4\1\2\0\6\1\4\0\15\1\5\0\3\1\1\0\7\1"+
"\17\0\4\2\10\0\2\7\12\0\1\7\2\0\1\5\2\0\5\2"+
"\20\0\2\10\3\0\1\6\17\0\1\10\13\0\5\2\5\0\6\2"+
"\1\0\1\1\15\0\1\1\20\0\15\1\63\0\41\2\21\0\1\1"+
"\4\0\1\1\2\0\12\1\1\0\1\1\3\0\5\1\6\0\1\1"+
"\1\0\1\1\1\0\1\1\1\0\4\1\1\0\13\1\2\0\4\1"+
"\5\0\5\1\4\0\1\1\21\0\51\1\u032d\0\64\1\u0716\0\57\1"+
"\1\0\57\1\1\0\205\1\6\0\4\1\3\2\16\0\46\1\12\0"+
"\66\1\11\0\1\1\17\0\1\2\27\1\11\0\7\1\1\0\7\1"+
"\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1"+
"\1\0\7\1\1\0\40\2\57\0\1\1\120\0\32\13\1\0\131\13"+
"\14\0\326\13\57\0\1\1\1\0\1\13\31\0\11\13\6\2\1\0"+
"\5\4\2\0\3\13\1\1\1\1\4\0\126\14\2\0\2\2\2\4"+
"\3\14\133\4\1\0\4\4\5\0\51\1\3\0\136\1\21\0\33\1"+
"\65\0\20\4\320\0\57\4\1\0\130\4\250\0\u19b6\13\112\0\u51cc\13"+
"\64\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\3\2\1"+
"\24\0\57\1\4\2\11\0\2\2\1\0\31\1\10\0\120\1\2\2"+
"\45\0\11\1\2\0\147\1\2\0\4\1\1\0\2\1\16\0\12\1"+
"\120\0\10\1\1\2\3\1\1\2\4\1\1\2\27\1\5\2\30\0"+
"\64\1\14\0\2\2\62\1\21\2\13\0\12\3\6\0\22\2\6\1"+
"\3\0\1\1\4\0\12\3\34\1\10\2\2\0\27\1\15\2\14\0"+
"\35\1\3\0\4\2\57\1\16\2\16\0\1\1\12\3\46\0\51\1"+
"\16\2\11\0\3\1\1\2\10\1\2\2\2\0\12\3\6\0\33\11"+
"\1\12\4\0\60\11\1\12\1\11\3\12\2\11\2\12\5\11\2\12"+
"\1\11\1\12\1\11\30\0\5\11\41\0\6\1\2\0\6\1\2\0"+
"\6\1\11\0\7\1\1\0\7\1\221\0\43\1\10\2\1\0\2\2"+
"\2\0\12\3\6\0\u2ba4\1\14\0\27\1\4\0\61\1\u2104\0\u012e\13"+
"\2\0\76\13\2\0\152\13\46\0\7\1\14\0\5\1\5\0\1\1"+
"\1\2\12\1\1\0\15\1\1\0\5\1\1\0\1\1\1\0\2\1"+
"\1\0\2\1\1\0\154\1\41\0\u016b\1\22\0\100\1\2\0\66\1"+
"\50\0\14\1\4\0\20\2\1\6\2\0\1\5\1\6\13\0\7\2"+
"\14\0\2\10\30\0\3\10\1\6\1\0\1\7\1\0\1\6\1\5"+
"\32\0\5\1\1\0\207\1\2\0\1\2\7\0\1\7\4\0\1\6"+
"\1\0\1\7\1\0\12\3\1\5\1\6\5\0\32\1\4\0\1\10"+
"\1\0\32\1\13\0\70\4\2\2\37\1\3\0\6\1\2\0\6\1"+
"\2\0\6\1\2\0\3\1\34\0\3\2\4\0";
/**
* Translates characters to character classes
*/
private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
/**
 * Translates DFA states to action switch labels.
 */
private static final int [] ZZ_ACTION = zzUnpackAction();

/** Packed form of {@link #ZZ_ACTION}: a sequence of (count, value) character pairs. */
private static final String ZZ_ACTION_PACKED_0 =
    "\1\0\1\1\1\2\1\3\1\2\1\1\1\4\1\5"+
    "\1\6\1\2\1\0\1\2\1\0\1\3\2\0";

/**
 * Expands {@link #ZZ_ACTION_PACKED_0} into a freshly allocated 16-entry table.
 *
 * @return the unpacked action table
 */
private static int [] zzUnpackAction() {
  final int [] table = new int[16];
  zzUnpackAction(ZZ_ACTION_PACKED_0, 0, table);
  return table;
}

/**
 * Expands run-length encoded (count, value) pairs into <code>result</code>.
 *
 * @param packed the packed pairs
 * @param offset index in <code>result</code> at which writing starts
 * @param result the array receiving the unpacked values
 * @return the index just past the last entry written
 */
private static int zzUnpackAction(String packed, int offset, int [] result) {
  int out = offset;                 /* index in unpacked array */
  final int end = packed.length();
  for (int in = 0; in < end; ) {    /* in: index in packed string */
    int count = packed.charAt(in++);
    final int value = packed.charAt(in++);
    // each pair writes at least one entry, even if its count is zero
    do {
      result[out++] = value;
    } while (--count > 0);
  }
  return out;
}
/**
 * Translates a state to a row index in the transition table
 */
private static final int [] ZZ_ROWMAP = zzUnpackRowMap();

/** Packed form of {@link #ZZ_ROWMAP}: each entry is stored as two characters (high 16 bits, low 16 bits). */
private static final String ZZ_ROWMAP_PACKED_0 =
    "\0\0\0\15\0\32\0\47\0\64\0\101\0\116\0\15"+
    "\0\15\0\133\0\150\0\165\0\202\0\217\0\101\0\234";

/**
 * Expands {@link #ZZ_ROWMAP_PACKED_0} into a freshly allocated 16-entry table.
 *
 * @return the unpacked row map
 */
private static int [] zzUnpackRowMap() {
  final int [] table = new int[16];
  zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, 0, table);
  return table;
}

/**
 * Combines consecutive character pairs of <code>packed</code> into ints
 * (first character shifted left by 16, OR-ed with the second).
 *
 * @param packed the packed character pairs
 * @param offset index in <code>result</code> at which writing starts
 * @param result the array receiving the unpacked values
 * @return the index just past the last entry written
 */
private static int zzUnpackRowMap(String packed, int offset, int [] result) {
  int out = offset;                         /* index in unpacked array */
  final int end = packed.length();
  for (int in = 0; in < end; in += 2) {     /* in: index in packed string */
    final int high = packed.charAt(in) << 16;
    final int low = packed.charAt(in + 1);
    result[out++] = high | low;
  }
  return out;
}
/**
 * The transition table of the DFA
 */
private static final int [] ZZ_TRANS = zzUnpackTrans();

/**
 * Packed form of {@link #ZZ_TRANS}: (count, value) character pairs, where
 * every value is stored incremented by one.
 */
private static final String ZZ_TRANS_PACKED_0 =
    "\1\2\1\3\1\2\1\4\1\5\3\2\1\6\2\7"+
    "\1\10\1\11\16\0\2\3\1\12\1\0\1\13\1\0"+
    "\1\13\1\14\1\0\1\3\3\0\1\3\2\4\2\0"+
    "\2\15\1\16\1\0\1\4\4\0\1\5\1\0\1\5"+
    "\3\0\1\14\1\0\1\5\3\0\1\3\1\17\1\4"+
    "\1\5\3\0\1\17\1\0\1\17\13\0\2\7\3\0"+
    "\1\3\2\12\2\0\2\20\1\14\1\0\1\12\3\0"+
    "\1\3\1\13\7\0\1\13\3\0\1\3\1\14\1\12"+
    "\1\5\3\0\1\14\1\0\1\14\4\0\1\15\1\4"+
    "\6\0\1\15\3\0\1\3\1\16\1\4\1\5\3\0"+
    "\1\16\1\0\1\16\4\0\1\20\1\12\6\0\1\20"+
    "\2\0";

/**
 * Expands {@link #ZZ_TRANS_PACKED_0} into a freshly allocated 169-entry table.
 *
 * @return the unpacked transition table
 */
private static int [] zzUnpackTrans() {
  final int [] table = new int[169];
  zzUnpackTrans(ZZ_TRANS_PACKED_0, 0, table);
  return table;
}

/**
 * Expands run-length encoded (count, value) pairs into <code>result</code>,
 * subtracting one from each stored value (so a stored 0 becomes -1).
 *
 * @param packed the packed pairs
 * @param offset index in <code>result</code> at which writing starts
 * @param result the array receiving the unpacked values
 * @return the index just past the last entry written
 */
private static int zzUnpackTrans(String packed, int offset, int [] result) {
  int out = offset;                 /* index in unpacked array */
  final int end = packed.length();
  for (int in = 0; in < end; ) {    /* in: index in packed string */
    int count = packed.charAt(in++);
    final int value = packed.charAt(in++) - 1;
    // each pair writes at least one entry, even if its count is zero
    do {
      result[out++] = value;
    } while (--count > 0);
  }
  return out;
}
/* error codes */
private static final int ZZ_UNKNOWN_ERROR = 0;
private static final int ZZ_NO_MATCH = 1;
private static final int ZZ_PUSHBACK_2BIG = 2;

/* error messages for the codes above, in the same order as the codes */
private static final String ZZ_ERROR_MSG[] = {
  // spelling corrected: this entry previously read "Unkown"
  "Unknown internal scanner error",
  "Error: could not match input",
  "Error: pushback value was too large"
};
/**
 * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
 */
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();

/** Packed form of {@link #ZZ_ATTRIBUTE}: a sequence of (count, value) character pairs. */
private static final String ZZ_ATTRIBUTE_PACKED_0 =
    "\1\0\1\11\5\1\2\11\1\1\1\0\1\1\1\0"+
    "\1\1\2\0";

/**
 * Expands {@link #ZZ_ATTRIBUTE_PACKED_0} into a freshly allocated 16-entry table.
 *
 * @return the unpacked attribute table
 */
private static int [] zzUnpackAttribute() {
  final int [] table = new int[16];
  zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, 0, table);
  return table;
}

/**
 * Expands run-length encoded (count, value) pairs into <code>result</code>.
 *
 * @param packed the packed pairs
 * @param offset index in <code>result</code> at which writing starts
 * @param result the array receiving the unpacked values
 * @return the index just past the last entry written
 */
private static int zzUnpackAttribute(String packed, int offset, int [] result) {
  int out = offset;                 /* index in unpacked array */
  final int end = packed.length();
  for (int in = 0; in < end; ) {    /* in: index in packed string */
    int count = packed.charAt(in++);
    final int value = packed.charAt(in++);
    // each pair writes at least one entry, even if its count is zero
    do {
      result[out++] = value;
    } while (--count > 0);
  }
  return out;
}
/** the input device */
private java.io.Reader zzReader;
/** the current state of the DFA */
private int zzState;
/** the current lexical state */
private int zzLexicalState = YYINITIAL;
/** this buffer contains the current text to be matched and is
the source of the yytext() string */
private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
/** the textposition at the last accepting state */
private int zzMarkedPos;
/** the current text position in the buffer */
private int zzCurrentPos;
/** startRead marks the beginning of the yytext() string in the buffer */
private int zzStartRead;
/** endRead marks the last character in the buffer, that has been read
from input */
private int zzEndRead;
/** number of newlines encountered up to the start of the matched text */
private int yyline;
/** the number of characters up to the start of the matched text */
private int yychar;
/**
* the number of characters from the last newline up to the start of the
* matched text
*/
private int yycolumn;
/**
* zzAtBOL == true <=> the scanner is currently at the beginning of a line
*/
private boolean zzAtBOL = true;
/** zzAtEOF == true <=> the scanner is at the EOF */
private boolean zzAtEOF;
/** denotes if the user-EOF-code has already been executed */
private boolean zzEOFDone;
/* user code: */
/** Alphanumeric sequences */
public static final String WORD_TYPE = "<ALPHANUM>";
/** Numbers */
public static final String NUMERIC_TYPE = "<NUM>";
/**
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
* scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
* together as a single token rather than broken up, because the logic
* required to break them at word boundaries is too complex for UAX#29.
* <p>
* See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
*/
public static final String SOUTH_EAST_ASIAN_TYPE = "<SOUTHEAST_ASIAN>";
public static final String IDEOGRAPHIC_TYPE = "<IDEOGRAPHIC>";
public static final String HIRAGANA_TYPE = "<HIRAGANA>";
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt
= addAttribute(PositionIncrementAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
private int posIncr;
/**
 * Creates a tokenizer reading from <code>input</code>; <code>source</code>
 * is handed to the superclass constructor.
 *
 * @param source The AttributeSource to use
 * @param input The input reader
 */
public UAX29Tokenizer(AttributeSource source, Reader input) {
  super(source, input);
  // the generated scanner reads from the same reader as the Tokenizer
  this.zzReader = input;
}
/**
 * Creates a tokenizer reading from <code>input</code>; <code>factory</code>
 * is handed to the superclass constructor.
 *
 * @param factory The AttributeFactory to use
 * @param input The input reader
 */
public UAX29Tokenizer(AttributeFactory factory, Reader input) {
  super(factory, input);
  // the generated scanner reads from the same reader as the Tokenizer
  this.zzReader = input;
}
/**
 * Changes the maximum token length. Matches longer than this limit are
 * skipped rather than emitted.
 *
 * @param length the new max allowed token length
 */
public void setMaxTokenLength(int length) {
  maxTokenLength = length;
}
/**
 * Reports the current maximum token length. Matches longer than this
 * limit are skipped rather than emitted.
 *
 * @return the max allowed token length
 */
public int getMaxTokenLength() {
  return this.maxTokenLength;
}
/**
 * Sets both the start and end offset to the corrected position just past
 * the most recent match.
 */
@Override
public final void end() {
  // set final offset
  final int endOfLastMatch = yychar + yylength();
  final int finalOffset = correctOffset(endOfLastMatch);
  offsetAtt.setOffset(finalOffset, finalOffset);
}
/**
* Switches this tokenizer to a new reader: first delegates to the superclass
* <code>reset(Reader)</code>, then re-initializes the generated scanner state
* through <code>yyreset(Reader)</code>.
*
* @param reader the new input to tokenize
* @throws IOException declared by the overridden superclass method
*/
@Override
public void reset(Reader reader) throws IOException {
super.reset(reader);
yyreset(reader);
}
/**
* Clears all attributes, resets the pending position increment to one, and
* then delegates to the generated scanning method <code>getNextToken()</code>.
*
* @return whatever <code>getNextToken()</code> returns
* @throws IOException if the scanning method reports an I/O error
*/
@Override
public final boolean incrementToken() throws IOException {
// This method is required because of two JFlex limitations:
// 1. No way to insert code at the beginning of the generated scanning
// get-next-token method; and
// 2. No way to declare @Override on the generated scanning method.
clearAttributes();
// start from an increment of one; populateAttributes() raises it for each skipped over-long token
posIncr = 1;
return getNextToken();
}
/**
 * Copies the current match into this TokenStream's attributes.
 * <p/>
 * When the match is no longer than maxTokenLength, the CharTermAttribute
 * receives the matched text, the OffsetAttribute the corrected start and end
 * offsets of the match, the TypeAttribute the passed-in tokenType, and the
 * PositionIncrementAttribute the current value of <code>posIncr</code>
 * (one plus the number of over-long matches skipped since it was last reset).
 * <p/>
 * When the match is longer than maxTokenLength, no attribute is written;
 * <code>posIncr</code> is incremented and false is returned.
 *
 * @param tokenType The type of the matching token
 * @return true there is a token available (not too long); false otherwise
 */
private boolean populateAttributes(String tokenType) {
  final int matchLength = yylength();
  if (matchLength > maxTokenLength) {
    // When we skip a too-long token, we treat it like a stopword, introducing
    // a position increment gap
    ++posIncr;
    return false;
  }
  termAtt.copyBuffer(zzBuffer, zzStartRead, matchLength);
  posIncrAtt.setPositionIncrement(posIncr);
  final int startOffset = correctOffset(yychar);
  final int endOffset = correctOffset(yychar + matchLength);
  offsetAtt.setOffset(startOffset, endOffset);
  typeAtt.setType(tokenType);
  return true;
}
/**
* Creates a new scanner that reads from the given reader.
* There is also a java.io.InputStream version of this constructor.
*
* @param in the java.io.Reader to read input from.
*/
public UAX29Tokenizer(java.io.Reader in) {
super(in);
// the generated scanner reads from the same reader that was passed to the superclass
this.zzReader = in;
}
/**
* Creates a new scanner.
* There is also a java.io.Reader version of this constructor.
* <p>
* The stream is wrapped in a <code>java.io.InputStreamReader</code> created
* without an explicit charset, so bytes are decoded with the platform
* default charset.
*
* @param in the java.io.InputStream to read input from.
*/
public UAX29Tokenizer(java.io.InputStream in) {
this(new java.io.InputStreamReader(in));
}
/**
 * Unpacks the compressed character translation table.
 * <p>
 * The packed form is a sequence of (count, value) character pairs; exactly
 * the first 2174 characters of <code>packed</code> are consumed.
 *
 * @param packed the packed character translation table
 * @return the unpacked character translation table (0x10000 entries)
 */
private static char [] zzUnpackCMap(String packed) {
  final char [] table = new char[0x10000];
  int out = 0;                        /* index in unpacked array */
  for (int in = 0; in < 2174; ) {     /* in: index in packed string */
    int count = packed.charAt(in++);
    final char value = packed.charAt(in++);
    // each pair writes at least one entry, even if its count is zero
    do {
      table[out++] = value;
    } while (--count > 0);
  }
  return table;
}
/**
* Refills the input buffer.
* <p>
* Already-consumed characters (everything before <code>zzStartRead</code>)
* are discarded by shifting the remaining content to the front of the
* buffer, the buffer is enlarged if the current position has reached its
* end, and then more characters are read from <code>zzReader</code>.
*
* @return <code>false</code>, iff there was new input.
*
* @exception java.io.IOException if any I/O-Error occurs
*/
private boolean zzRefill() throws java.io.IOException {
/* first: make room (if you can) */
if (zzStartRead > 0) {
// move the not-yet-consumed region [zzStartRead, zzEndRead) to index 0
System.arraycopy(zzBuffer, zzStartRead,
zzBuffer, 0,
zzEndRead-zzStartRead);
/* translate stored positions */
// zzStartRead must be zeroed last: the three adjustments above it all use its old value
zzEndRead-= zzStartRead;
zzCurrentPos-= zzStartRead;
zzMarkedPos-= zzStartRead;
zzStartRead = 0;
}
/* is the buffer big enough? */
if (zzCurrentPos >= zzBuffer.length) {
/* if not: blow it up */
// new capacity is twice the current position; existing content is copied over
char newBuffer[] = new char[zzCurrentPos*2];
System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
zzBuffer = newBuffer;
}
/* finally: fill the buffer with new input */
int numRead = zzReader.read(zzBuffer, zzEndRead,
zzBuffer.length-zzEndRead);
if (numRead > 0) {
zzEndRead+= numRead;
return false;
}
// unlikely but not impossible: read 0 characters, but not at end of stream
if (numRead == 0) {
// fall back to a single-character read to tell "nothing yet" apart from end of stream
int c = zzReader.read();
if (c == -1) {
return true;
} else {
zzBuffer[zzEndRead++] = (char) c;
return false;
}
}
// numRead < 0
return true;
}
/**
* Closes the input stream.
*/
private final void yyclose() throws java.io.IOException {
  // Flag end of input; getNextToken() checks this flag before attempting a refill.
  zzAtEOF = true;
  // Collapse the buffered window so no previously read text remains available.
  zzEndRead = zzStartRead;
  if (zzReader == null) {
    return;
  }
  zzReader.close();
}
/**
* Resets the scanner to read from a new input stream.
* Does not close the old reader.
*
* All internal variables are reset, the old input stream
* <b>cannot</b> be reused (internal buffer is discarded and lost).
* Lexical state is set to <tt>ZZ_INITIAL</tt>.
*
* Internal scan buffer is resized down to its initial length, if it has grown.
*
* @param reader the new input stream
*/
private final void yyreset(java.io.Reader reader) {
  // Swap in the new input source; the previous reader is not closed here.
  zzReader = reader;

  // Status flags return to their start-of-input values.
  zzAtBOL = true;
  zzAtEOF = false;
  zzEOFDone = false;

  // Empty the buffer window and rewind every stored position.
  zzStartRead = 0;
  zzEndRead = 0;
  zzMarkedPos = 0;
  zzCurrentPos = 0;

  // Position counters restart from zero.
  yycolumn = 0;
  yychar = 0;
  yyline = 0;

  // Scanning resumes in the initial lexical state.
  zzLexicalState = YYINITIAL;

  // A buffer that has grown beyond the default size is replaced by a fresh default-sized one.
  if (zzBuffer.length > ZZ_BUFFERSIZE) {
    zzBuffer = new char[ZZ_BUFFERSIZE];
  }
}
/**
* Returns the current lexical state.
*/
private final int yystate() {
  // Report whichever lexical state was last stored by yybegin()/yyreset().
  final int activeState = this.zzLexicalState;
  return activeState;
}
/**
* Enters a new lexical state
*
* @param newState the new lexical state
*/
private final void yybegin(int newState) {
  // getNextToken() reads this field to pick its start state for the next match.
  this.zzLexicalState = newState;
}
/**
* Returns the text matched by the current regular expression.
*/
private final String yytext() {
  // The current match occupies zzBuffer[zzStartRead .. zzMarkedPos).
  final int matchStart = zzStartRead;
  final int matchLength = zzMarkedPos - matchStart;
  return new String(zzBuffer, matchStart, matchLength);
}
/**
* Returns the character at position <tt>pos</tt> from the
* matched text.
*
* It is equivalent to yytext().charAt(pos), but faster
*
* @param pos the position of the character to fetch.
* A value from 0 to yylength()-1.
*
* @return the character at position pos
*/
private final char yycharat(int pos) {
  // Translate the match-relative offset into an absolute buffer index.
  final int absoluteIndex = zzStartRead + pos;
  return zzBuffer[absoluteIndex];
}
/**
* Returns the length of the matched text region.
*/
private final int yylength() {
  // Distance between the start of the match and the marked end position.
  final int matchEnd = zzMarkedPos;
  return matchEnd - zzStartRead;
}
/**
* Reports an error that occurred while scanning.
*
* In a wellformed scanner (no or only correct usage of
* yypushback(int) and a match-all fallback rule) this method
* will only be called with things that "Can't Possibly Happen".
* If this method is called, something is seriously wrong
* (e.g. a JFlex bug producing a faulty scanner etc.).
*
* Usual syntax/scanner level error handling should be done
* in error fallback rules.
*
* @param errorCode the code of the errormessage to display
*/
private void zzScanError(int errorCode) {
  // Codes outside the message table are reported with the "unknown error" text.
  final boolean knownCode = errorCode >= 0 && errorCode < ZZ_ERROR_MSG.length;
  final String message = knownCode
      ? ZZ_ERROR_MSG[errorCode]
      : ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
  // Always throws: an Error carrying the selected message.
  throw new Error(message);
}
/**
* Pushes the specified amount of characters back into the input stream.
*
* They will be read again by the next call of the scanning method
*
* @param number the number of characters to be read again.
* This number must not be greater than yylength()!
*/
private void yypushback(int number) {
  // More characters than the current match holds cannot be given back.
  final int matchedLength = yylength();
  if (matchedLength < number) {
    zzScanError(ZZ_PUSHBACK_2BIG);
  }
  // Shrinking the marked end makes the next scan restart that many characters earlier.
  zzMarkedPos -= number;
}
/**
* Resumes scanning until the next regular expression is matched,
* the end of input is encountered or an I/O-Error occurs.
*
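* @return <code>true</code> as soon as populateAttributes() accepts a matched token,
*         <code>false</code> once the end of input is reached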
* @exception java.io.IOException if any I/O-Error occurs
*/
private boolean getNextToken() throws java.io.IOException {
// Table-driven scan loop: matches text against the transition tables, then
// dispatches on the action number of the last accepting state seen.
int zzInput;
int zzAction;
// cached fields:
int zzCurrentPosL;
int zzMarkedPosL;
int zzEndReadL = zzEndRead;
char [] zzBufferL = zzBuffer;
char [] zzCMapL = ZZ_CMAP;
int [] zzTransL = ZZ_TRANS;
int [] zzRowMapL = ZZ_ROWMAP;
int [] zzAttrL = ZZ_ATTRIBUTE;
// Each pass of this loop attempts one match, starting where the previous match ended.
while (true) {
zzMarkedPosL = zzMarkedPos;
// Advance the character counter by the length of the previous match.
yychar+= zzMarkedPosL-zzStartRead;
// -1 means "no accepting state seen yet" for this match.
zzAction = -1;
zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
// Start state is chosen by the current lexical state.
zzState = ZZ_LEXSTATE[zzLexicalState];
zzForAction: {
while (true) {
// Fetch the next input character: from the buffer, from EOF, or after a refill.
if (zzCurrentPosL < zzEndReadL)
zzInput = zzBufferL[zzCurrentPosL++];
else if (zzAtEOF) {
zzInput = YYEOF;
break zzForAction;
}
else {
// store back cached positions
zzCurrentPos = zzCurrentPosL;
zzMarkedPos = zzMarkedPosL;
boolean eof = zzRefill();
// get translated positions and possibly new buffer
zzCurrentPosL = zzCurrentPos;
zzMarkedPosL = zzMarkedPos;
zzBufferL = zzBuffer;
zzEndReadL = zzEndRead;
if (eof) {
zzInput = YYEOF;
break zzForAction;
}
else {
zzInput = zzBufferL[zzCurrentPosL++];
}
}
// Look up the transition for (current state, character class of the input).
int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
// -1: no transition from here, so the match attempt ends.
if (zzNext == -1) break zzForAction;
zzState = zzNext;
int zzAttributes = zzAttrL[zzState];
// Bit 1 set: remember this state as the action and mark the match end here.
if ( (zzAttributes & 1) == 1 ) {
zzAction = zzState;
zzMarkedPosL = zzCurrentPosL;
// Bit 8 set: stop scanning immediately instead of reading further input.
if ( (zzAttributes & 8) == 8 ) break zzForAction;
}
}
}
// store back cached position
zzMarkedPos = zzMarkedPosL;
// Each action case below either returns true or falls through into the
// break-only case that follows it, which sends control back to the outer loop.
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 5:
{ if (populateAttributes(IDEOGRAPHIC_TYPE)) return true;
}
case 7: break;
case 1:
{ /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
}
case 8: break;
case 3:
{ if (populateAttributes(NUMERIC_TYPE)) return true;
}
case 9: break;
case 6:
{ if (populateAttributes(HIRAGANA_TYPE)) return true;
}
case 10: break;
case 4:
{ if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true;
}
case 11: break;
case 2:
{ if (populateAttributes(WORD_TYPE)) return true;
}
case 12: break;
default:
// No action matched: either a clean end of input (nothing consumed) or a scanner error.
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
{
return false;
}
}
else {
zzScanError(ZZ_NO_MATCH);
}
}
}
}
}

View File

@ -32,11 +32,14 @@ import org.apache.lucene.util.AttributeSource;
* This class implements Word Break rules from the Unicode Text Segmentation
* algorithm, as specified in
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
* URLs and email addresses are also tokenized according to the relevant RFCs.
* <p/>
* Tokens produced are of the following types:
* <ul>
* <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
* <li>&lt;NUM&gt;: A number</li>
* <li>&lt;URL&gt;: A URL</li>
* <li>&lt;EMAIL&gt;: An email address</li>
* <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
* <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
@ -57,7 +60,7 @@ import org.apache.lucene.util.AttributeSource;
%final
%public
%apiprivate
%class UAX29Tokenizer
%class UAX29URLEmailTokenizer
%extends Tokenizer
%type boolean
%function getNextToken
@ -67,7 +70,7 @@ import org.apache.lucene.util.AttributeSource;
super(in);
%init}
// WB4. X (Extend | Format)* --> X
// UAX#29 WB4. X (Extend | Format)* --> X
//
ALetterEx = \p{WB:ALetter} [\p{WB:Format}\p{WB:Extend}]*
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
@ -77,6 +80,85 @@ MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]*
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
// URL and E-mail syntax specifications:
//
// RFC-952: DOD INTERNET HOST TABLE SPECIFICATION
// RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
// RFC-1123: Requirements for Internet Hosts - Application and Support
// RFC-1738: Uniform Resource Locators (URL)
// RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
// RFC-5234: Augmented BNF for Syntax Specifications: ABNF
// RFC-5321: Simple Mail Transfer Protocol
// RFC-5322: Internet Message Format
%include src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
DomainNameLoose = {DomainLabel} ("." {DomainLabel})*
IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
IPv4Address = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3}
IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
IPv6Address = ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
| "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
| {IPv6Hex16Bit}? "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit} ":" {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::" {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit}
| (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"
URIunreserved = [-._~A-Za-z0-9]
URIpercentEncoded = "%" [0-9A-Fa-f]{2}
URIsubDelims = [!$&'()*+,;=]
URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
URIquery = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIport = ":" [0-9]{1,5}
URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}
URIhostLoose = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose}
URIauthorityStrict = {URIhostStrict} {URIport}?
URIauthorityLoose = {URIlogin}? {URIhostLoose} {URIport}?
HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
HTTPpath = ("/" {HTTPsegment})*
HTTPscheme = [hH][tT][tT][pP][sS]? "://"
HTTPurlFull = {HTTPscheme} {URIauthorityLoose} {HTTPpath}? {URIquery}? {URIfragment}?
// {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
HTTPurlNoScheme = {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}
FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
FTPscheme = [fF][tT][pP] "://"
FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?
FILEscheme = [fF][iI][lL][eE] "://"
FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?
URL = {HTTPurl} | {FTPurl} | {FILEurl}
EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
// DFA minimization allows {IPv6Address} and {IPv4Address} to be included
// in the {EMAILbracketedHost} definition without incurring any size penalties,
// since {EMAILdomainLiteralText} recognizes all valid IP addresses.
// The IP address regexes are included in {EMAILbracketedHost} simply as a
// reminder that they are acceptable bracketed host forms.
EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"
EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
%{
/** Alphanumeric sequences */
public static final String WORD_TYPE = "<ALPHANUM>";
@ -84,6 +166,12 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]
/** Numbers */
public static final String NUMERIC_TYPE = "<NUM>";
/** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
public static final String URL_TYPE = "<URL>";
/** E-mail addresses; the type string follows the same &lt;NAME&gt; form as the other token types. */
public static final String EMAIL_TYPE = "<EMAIL>";
/**
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
* scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
@ -112,7 +200,7 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]
* @param source The AttributeSource to use
* @param input The input reader
*/
public UAX29Tokenizer(AttributeSource source, Reader input) {
public UAX29URLEmailTokenizer(AttributeSource source, Reader input) {
super(source, input);
zzReader = input;
}
@ -121,7 +209,7 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]
* @param factory The AttributeFactory to use
* @param input The input reader
*/
public UAX29Tokenizer(AttributeFactory factory, Reader input) {
public UAX29URLEmailTokenizer(AttributeFactory factory, Reader input) {
super(factory, input);
zzReader = input;
}
@ -201,17 +289,19 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]
%%
// WB1. sot ÷
// WB2. ÷ eot
// UAX#29 WB1. sot ÷
// WB2. ÷ eot
//
<<EOF>> { return false; }
{URL} { if (populateAttributes(URL_TYPE)) return true; }
{EMAIL} {if (populateAttributes(EMAIL_TYPE)) return true; }
// WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet) × Numeric
// WB12. Numeric × (MidNum | MidNumLet) Numeric
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet) × Numeric
// WB12. Numeric × (MidNum | MidNumLet) Numeric
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
| {MidNumericEx} {NumericEx}
@ -220,14 +310,14 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]
{ if (populateAttributes(NUMERIC_TYPE)) return true; }
// WB5. ALetter × ALetter
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
// WB7. ALetter (MidLetter | MidNumLet) × ALetter
// WB9. ALetter × Numeric
// WB10. Numeric × ALetter
// WB13. Katakana × Katakana
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
// UAX#29 WB5. ALetter × ALetter
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
// WB7. ALetter (MidLetter | MidNumLet) × ALetter
// WB9. ALetter × Numeric
// WB10. Numeric × ALetter
// WB13. Katakana × Katakana
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
@ -260,15 +350,15 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]
//
\p{LB:Complex_Context}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }
// WB14. Any ÷ Any
// UAX#29 WB14. Any ÷ Any
//
\p{Script:Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
\p{Script:Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }
// WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB14. Any ÷ Any
// UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB14. Any ÷ Any
//
[^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }

View File

@ -27,7 +27,10 @@
as of Lucene 3.1, implements the Word Break rules from the Unicode Text
Segmentation algorithm, as specified in
<a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
URLs and email addresses are also tokenized according to the relevant RFCs.
Unlike <code>UAX29URLEmailTokenizer</code>, URLs and email addresses are
<b>not</b> tokenized as single tokens, but are instead split up into
tokens according to the UAX#29 word break rules.
<br/>
<code><a href="StandardAnalyzer">StandardAnalyzer</a></code> includes
<code>StandardTokenizer</code>,
<code><a href="StandardFilter">StandardFilter</a></code>,
@ -46,13 +49,11 @@
<code><a href="../../../../../../all/org/apache/lucene/analysis/LowerCaseFilter.html">LowerCaseFilter</a></code>
and <code><a href="../../../../../../all/org/apache/lucene/analysis/StopFilter.html">StopFilter</a></code>.
</li>
<li><code><a href="UAX29Tokenizer.html">UAX29Tokenizer</a></code>:
<li><code><a href="UAX29URLEmailTokenizer.html">UAX29URLEmailTokenizer</a></code>:
implements the Word Break rules from the Unicode Text Segmentation
algorithm, as specified in
<a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
Unlike <code>StandardTokenizer</code>, URLs and email addresses are
<b>not</b> tokenized as single tokens, but are instead split up into
tokens according to the UAX#29 word break rules.
URLs and email addresses are also tokenized according to the relevant RFCs.
</li>
</ul>
</body>

View File

@ -2,21 +2,14 @@ package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -58,63 +51,6 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
}
};
/** Passes through tokens with type "<URL>" and blocks all other types. */
private class URLFilter extends TokenFilter {
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
public URLFilter(TokenStream in) {
super(in);
}
@Override
public final boolean incrementToken() throws java.io.IOException {
boolean isTokenAvailable = false;
while (input.incrementToken()) {
if (typeAtt.type() == StandardTokenizer.TOKEN_TYPES[StandardTokenizer.URL]) {
isTokenAvailable = true;
break;
}
}
return isTokenAvailable;
}
}
/** Passes through tokens with type "<EMAIL>" and blocks all other types. */
private class EmailFilter extends TokenFilter {
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
public EmailFilter(TokenStream in) {
super(in);
}
@Override
public final boolean incrementToken() throws java.io.IOException {
boolean isTokenAvailable = false;
while (input.incrementToken()) {
if (typeAtt.type() == StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMAIL]) {
isTokenAvailable = true;
break;
}
}
return isTokenAvailable;
}
}
private Analyzer urlAnalyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
StandardTokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
tokenizer.setMaxTokenLength(Integer.MAX_VALUE); // Tokenize arbitrary length URLs
TokenFilter filter = new URLFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}
};
private Analyzer emailAnalyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
TokenFilter filter = new EmailFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}
};
public void testArmenian() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
@ -261,138 +197,6 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
}
public void testWikiURLs() throws Exception {
Reader reader = null;
String luceneResourcesWikiPage;
try {
reader = new InputStreamReader
(getClass().getResourceAsStream("LuceneResourcesWikiPage.html"), "UTF-8");
StringBuilder builder = new StringBuilder();
char[] buffer = new char[1024];
int numCharsRead;
while (-1 != (numCharsRead = reader.read(buffer))) {
builder.append(buffer, 0, numCharsRead);
}
luceneResourcesWikiPage = builder.toString();
} finally {
if (null != reader) {
reader.close();
}
}
assertTrue(null != luceneResourcesWikiPage
&& luceneResourcesWikiPage.length() > 0);
BufferedReader bufferedReader = null;
String[] urls;
try {
List<String> urlList = new ArrayList<String>();
bufferedReader = new BufferedReader(new InputStreamReader
(getClass().getResourceAsStream("LuceneResourcesWikiPageURLs.txt"), "UTF-8"));
String line;
while (null != (line = bufferedReader.readLine())) {
line = line.trim();
if (line.length() > 0) {
urlList.add(line);
}
}
urls = urlList.toArray(new String[urlList.size()]);
} finally {
if (null != bufferedReader) {
bufferedReader.close();
}
}
assertTrue(null != urls && urls.length > 0);
BaseTokenStreamTestCase.assertAnalyzesTo
(urlAnalyzer, luceneResourcesWikiPage, urls);
}
public void testEmails() throws Exception {
Reader reader = null;
String randomTextWithEmails;
try {
reader = new InputStreamReader
(getClass().getResourceAsStream("random.text.with.email.addresses.txt"), "UTF-8");
StringBuilder builder = new StringBuilder();
char[] buffer = new char[1024];
int numCharsRead;
while (-1 != (numCharsRead = reader.read(buffer))) {
builder.append(buffer, 0, numCharsRead);
}
randomTextWithEmails = builder.toString();
} finally {
if (null != reader) {
reader.close();
}
}
assertTrue(null != randomTextWithEmails
&& randomTextWithEmails.length() > 0);
BufferedReader bufferedReader = null;
String[] emails;
try {
List<String> emailList = new ArrayList<String>();
bufferedReader = new BufferedReader(new InputStreamReader
(getClass().getResourceAsStream("email.addresses.from.random.text.with.email.addresses.txt"), "UTF-8"));
String line;
while (null != (line = bufferedReader.readLine())) {
line = line.trim();
if (line.length() > 0) {
emailList.add(line);
}
}
emails = emailList.toArray(new String[emailList.size()]);
} finally {
if (null != bufferedReader) {
bufferedReader.close();
}
}
assertTrue(null != emails && emails.length > 0);
BaseTokenStreamTestCase.assertAnalyzesTo
(emailAnalyzer, randomTextWithEmails, emails);
}
public void testURLs() throws Exception {
Reader reader = null;
String randomTextWithURLs;
try {
reader = new InputStreamReader
(getClass().getResourceAsStream("random.text.with.urls.txt"), "UTF-8");
StringBuilder builder = new StringBuilder();
char[] buffer = new char[1024];
int numCharsRead;
while (-1 != (numCharsRead = reader.read(buffer))) {
builder.append(buffer, 0, numCharsRead);
}
randomTextWithURLs = builder.toString();
} finally {
if (null != reader) {
reader.close();
}
}
assertTrue(null != randomTextWithURLs
&& randomTextWithURLs.length() > 0);
BufferedReader bufferedReader = null;
String[] urls;
try {
List<String> urlList = new ArrayList<String>();
bufferedReader = new BufferedReader(new InputStreamReader
(getClass().getResourceAsStream("urls.from.random.text.with.urls.txt"), "UTF-8"));
String line;
while (null != (line = bufferedReader.readLine())) {
line = line.trim();
if (line.length() > 0) {
urlList.add(line);
}
}
urls = urlList.toArray(new String[urlList.size()]);
} finally {
if (null != bufferedReader) {
bufferedReader.close();
}
}
assertTrue(null != urls && urls.length > 0);
BaseTokenStreamTestCase.assertAnalyzesTo
(urlAnalyzer, randomTextWithURLs, urls);
}
public void testUnicodeWordBreaks() throws Exception {
WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
wordBreakTest.test(a);

View File

@ -2,14 +2,21 @@ package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.UAX29Tokenizer;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -28,7 +35,7 @@ import java.util.Arrays;
* limitations under the License.
*/
public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
public void testHugeDoc() throws IOException {
StringBuilder sb = new StringBuilder();
@ -37,7 +44,7 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
sb.append(whitespace);
sb.append("testing 1234");
String input = sb.toString();
UAX29Tokenizer tokenizer = new UAX29Tokenizer(new StringReader(input));
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(new StringReader(input));
BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
@ -46,11 +53,70 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
protected TokenStreamComponents createComponents
(String fieldName, Reader reader) {
Tokenizer tokenizer = new UAX29Tokenizer(reader);
Tokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
return new TokenStreamComponents(tokenizer);
}
};
/** Passes through tokens with type "<URL>" and blocks all other types. */
private class URLFilter extends TokenFilter {
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
public URLFilter(TokenStream in) {
super(in);
}
@Override
public final boolean incrementToken() throws java.io.IOException {
boolean isTokenAvailable = false;
while (input.incrementToken()) {
if (typeAtt.type() == UAX29URLEmailTokenizer.URL_TYPE) {
isTokenAvailable = true;
break;
}
}
return isTokenAvailable;
}
}
/** Passes through tokens with type "<EMAIL>" and blocks all other types. */
private class EmailFilter extends TokenFilter {
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
public EmailFilter(TokenStream in) {
super(in);
}
@Override
public final boolean incrementToken() throws java.io.IOException {
boolean isTokenAvailable = false;
while (input.incrementToken()) {
if (typeAtt.type() == UAX29URLEmailTokenizer.EMAIL_TYPE) {
isTokenAvailable = true;
break;
}
}
return isTokenAvailable;
}
}
private Analyzer urlAnalyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
tokenizer.setMaxTokenLength(Integer.MAX_VALUE); // Tokenize arbitrary length URLs
TokenFilter filter = new URLFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}
};
private Analyzer emailAnalyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
TokenFilter filter = new EmailFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}
};
public void testArmenian() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
@ -163,7 +229,6 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
}
public void testTextWithNumbersSA() throws Exception {
@ -197,6 +262,140 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
}
public void testWikiURLs() throws Exception {
Reader reader = null;
String luceneResourcesWikiPage;
try {
reader = new InputStreamReader(getClass().getResourceAsStream
("LuceneResourcesWikiPage.html"), "UTF-8");
StringBuilder builder = new StringBuilder();
char[] buffer = new char[1024];
int numCharsRead;
while (-1 != (numCharsRead = reader.read(buffer))) {
builder.append(buffer, 0, numCharsRead);
}
luceneResourcesWikiPage = builder.toString();
} finally {
if (null != reader) {
reader.close();
}
}
assertTrue(null != luceneResourcesWikiPage
&& luceneResourcesWikiPage.length() > 0);
BufferedReader bufferedReader = null;
String[] urls;
try {
List<String> urlList = new ArrayList<String>();
bufferedReader = new BufferedReader(new InputStreamReader
(getClass().getResourceAsStream("LuceneResourcesWikiPageURLs.txt"), "UTF-8"));
String line;
while (null != (line = bufferedReader.readLine())) {
line = line.trim();
if (line.length() > 0) {
urlList.add(line);
}
}
urls = urlList.toArray(new String[urlList.size()]);
} finally {
if (null != bufferedReader) {
bufferedReader.close();
}
}
assertTrue(null != urls && urls.length > 0);
BaseTokenStreamTestCase.assertAnalyzesTo
(urlAnalyzer, luceneResourcesWikiPage, urls);
}
public void testEmails() throws Exception {
Reader reader = null;
String randomTextWithEmails;
try {
reader = new InputStreamReader(getClass().getResourceAsStream
("random.text.with.email.addresses.txt"), "UTF-8");
StringBuilder builder = new StringBuilder();
char[] buffer = new char[1024];
int numCharsRead;
while (-1 != (numCharsRead = reader.read(buffer))) {
builder.append(buffer, 0, numCharsRead);
}
randomTextWithEmails = builder.toString();
} finally {
if (null != reader) {
reader.close();
}
}
assertTrue(null != randomTextWithEmails
&& randomTextWithEmails.length() > 0);
BufferedReader bufferedReader = null;
String[] emails;
try {
List<String> emailList = new ArrayList<String>();
bufferedReader = new BufferedReader(new InputStreamReader
(getClass().getResourceAsStream
("email.addresses.from.random.text.with.email.addresses.txt"), "UTF-8"));
String line;
while (null != (line = bufferedReader.readLine())) {
line = line.trim();
if (line.length() > 0) {
emailList.add(line);
}
}
emails = emailList.toArray(new String[emailList.size()]);
} finally {
if (null != bufferedReader) {
bufferedReader.close();
}
}
assertTrue(null != emails && emails.length > 0);
BaseTokenStreamTestCase.assertAnalyzesTo
(emailAnalyzer, randomTextWithEmails, emails);
}
public void testURLs() throws Exception {
Reader reader = null;
String randomTextWithURLs;
try {
reader = new InputStreamReader(getClass().getResourceAsStream
("random.text.with.urls.txt"), "UTF-8");
StringBuilder builder = new StringBuilder();
char[] buffer = new char[1024];
int numCharsRead;
while (-1 != (numCharsRead = reader.read(buffer))) {
builder.append(buffer, 0, numCharsRead);
}
randomTextWithURLs = builder.toString();
} finally {
if (null != reader) {
reader.close();
}
}
assertTrue(null != randomTextWithURLs
&& randomTextWithURLs.length() > 0);
BufferedReader bufferedReader = null;
String[] urls;
try {
List<String> urlList = new ArrayList<String>();
bufferedReader = new BufferedReader(new InputStreamReader
(getClass().getResourceAsStream
("urls.from.random.text.with.urls.txt"), "UTF-8"));
String line;
while (null != (line = bufferedReader.readLine())) {
line = line.trim();
if (line.length() > 0) {
urlList.add(line);
}
}
urls = urlList.toArray(new String[urlList.size()]);
} finally {
if (null != bufferedReader) {
bufferedReader.close();
}
}
assertTrue(null != urls && urls.length > 0);
BaseTokenStreamTestCase.assertAnalyzesTo
(urlAnalyzer, randomTextWithURLs, urls);
}
public void testUnicodeWordBreaks() throws Exception {
WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
wordBreakTest.test(a);

View File

@ -123,7 +123,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
assertAnalyzesToReuse(
analyzer,
"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz@demo.com" });
new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });
}
/** @deprecated (3.1) for version back compat */

View File

@ -302,8 +302,10 @@ New Features
* SOLR-1336: Add SmartChinese (word segmentation for Simplified Chinese)
tokenizer and filters to contrib/analysis-extras (rmuir)
* SOLR-2211: Added UAX29TokenizerFactory, which implements UAX#29, a Unicode algorithm
with good results for most languages. (Tom Burton-West via rmuir)
* SOLR-2211,LUCENE-2763: Added UAX29URLEmailTokenizerFactory, which implements
UAX#29, a Unicode algorithm with good results for most languages, as well as
URL and E-mail tokenization according to the relevant RFCs.
(Tom Burton-West via rmuir)
* SOLR-2237: Added StempelPolishStemFilterFactory to contrib/analysis-extras (rmuir)

View File

@ -20,7 +20,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.standard.UAX29Tokenizer;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import java.io.Reader;
import java.util.Map;
@ -30,14 +30,14 @@ import java.util.Map;
*
*/
public class UAX29TokenizerFactory extends BaseTokenizerFactory {
public class UAX29URLEmailTokenizerFactory extends BaseTokenizerFactory {
@Override
public void init(Map<String,String> args) {
super.init(args);
assureMatchVersion();
}
public UAX29Tokenizer create(Reader input) {
return new UAX29Tokenizer(input);
public UAX29URLEmailTokenizer create(Reader input) {
return new UAX29URLEmailTokenizer(input);
}
}

View File

@ -1,81 +0,0 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
/**
 * A few tests based on org.apache.lucene.analysis.TestUAX29Tokenizer.
 * <p>
 * This file contains non-ASCII string literals and must be compiled as UTF-8.
 */
public class TestUAX29TokenizerFactory extends BaseTokenTestCase {

  /**
   * Creates a tokenizer over {@code text} from a new UAX29TokenizerFactory
   * that has been initialized with DEFAULT_VERSION_PARAM.
   *
   * @param text the input to tokenize
   * @return the tokenizer produced by the factory for that input
   */
  private Tokenizer tokenize(String text) {
    Reader reader = new StringReader(text);
    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    return factory.create(reader);
  }

  /**
   * Test UAX29TokenizerFactory: a combining accent and an apostrophe stay
   * inside their word, and the trailing question mark is not emitted.
   */
  public void testUAX29Tokenizer() throws Exception {
    Tokenizer stream = tokenize("Wha\u0301t's this thing do?");
    assertTokenStreamContents(stream,
        new String[] {"Wha\u0301t's", "this", "thing", "do" });
  }

  /**
   * Mixed Arabic/Latin text: words and the number are emitted, while quotes,
   * colons, parentheses, the Arabic comma and the final period are not.
   */
  public void testArabic() throws Exception {
    Tokenizer stream = tokenize("الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.");
    assertTokenStreamContents(stream,
        new String[] {"الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
            "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" });
  }

  /**
   * Each Han ideograph is expected as its own token; the fullwidth digit run
   * and the fullwidth Latin run are each expected as one token.
   */
  public void testChinese() throws Exception {
    Tokenizer stream = tokenize("我是中国人。 １２３４ Ｔｅｓｔｓ ");
    assertTokenStreamContents(stream,
        new String[] {"我", "是", "中", "国", "人", "１２３４", "Ｔｅｓｔｓ"});
  }

  /** Hangul text is expected to split on whitespace only. */
  public void testKorean() throws Exception {
    Tokenizer stream = tokenize("안녕하세요 한글입니다");
    assertTokenStreamContents(stream,
        new String[] {"안녕하세요", "한글입니다"});
  }

  /** Hyphens are expected to separate words and not appear in any token. */
  public void testHyphen() throws Exception {
    Tokenizer stream = tokenize("some-dashed-phrase");
    assertTokenStreamContents(stream,
        new String[] {"some", "dashed", "phrase"});
  }
}

View File

@ -0,0 +1,155 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
/**
 * A few tests based on org.apache.lucene.analysis.TestUAX29URLEmailTokenizer.
 * <p>
 * This file contains non-ASCII string literals and must be compiled as UTF-8.
 */
public class TestUAX29URLEmailTokenizerFactory extends BaseTokenTestCase {

  /**
   * Creates a tokenizer over {@code text} from a new
   * UAX29URLEmailTokenizerFactory that has been initialized with
   * DEFAULT_VERSION_PARAM.
   *
   * @param text the input to tokenize
   * @return the tokenizer produced by the factory for that input
   */
  private Tokenizer tokenize(String text) {
    Reader reader = new StringReader(text);
    UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    return factory.create(reader);
  }

  /**
   * A combining accent and an apostrophe stay inside their word, and the
   * trailing question mark is not emitted.
   */
  public void testUAX29URLEmailTokenizer() throws Exception {
    Tokenizer stream = tokenize("Wha\u0301t's this thing do?");
    assertTokenStreamContents(stream,
        new String[] {"Wha\u0301t's", "this", "thing", "do" });
  }

  /**
   * Mixed Arabic/Latin text: words and the number are emitted, while quotes,
   * colons, parentheses, the Arabic comma and the final period are not.
   */
  public void testArabic() throws Exception {
    Tokenizer stream = tokenize("الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.");
    assertTokenStreamContents(stream,
        new String[] {"الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
            "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" });
  }

  /**
   * Each Han ideograph is expected as its own token; the fullwidth digit run
   * and the fullwidth Latin run are each expected as one token.
   */
  public void testChinese() throws Exception {
    Tokenizer stream = tokenize("我是中国人。 １２３４ Ｔｅｓｔｓ ");
    assertTokenStreamContents(stream,
        new String[] {"我", "是", "中", "国", "人", "１２３４", "Ｔｅｓｔｓ"});
  }

  /** Hangul text is expected to split on whitespace only. */
  public void testKorean() throws Exception {
    Tokenizer stream = tokenize("안녕하세요 한글입니다");
    assertTokenStreamContents(stream,
        new String[] {"안녕하세요", "한글입니다"});
  }

  /** Hyphens are expected to separate words and not appear in any token. */
  public void testHyphen() throws Exception {
    Tokenizer stream = tokenize("some-dashed-phrase");
    assertTokenStreamContents(stream,
        new String[] {"some", "dashed", "phrase"});
  }

  // Test with some URLs from TestUAX29URLEmailTokenizer's
  // urls.from.random.text.with.urls.txt
  /**
   * URLs interleaved with ordinary words: every URL (with or without a
   * scheme, with host names, IPv4 or bracketed IPv6 hosts, and a file: URL)
   * is expected as one token, and the surrounding words as separate tokens.
   */
  public void testURLs() throws Exception {
    String textWithURLs
      = "http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram&paragraphs=50&length=200&no-ads=on\n"
      + " some extra\nWords thrown in here. "
      + "http://c5-3486.bisynxu.FR/aI.YnNms/"
      + " samba Halta gamba "
      + "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R\n"
      + "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb\n"
      + "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m"
      + " inter Locutio "
      + "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/\n"
      + "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7"
      + " blah Sirrah woof "
      + "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4\n";
    Tokenizer stream = tokenize(textWithURLs);
    assertTokenStreamContents(stream,
        new String[] {
          "http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram&paragraphs=50&length=200&no-ads=on",
          "some", "extra", "Words", "thrown", "in", "here",
          "http://c5-3486.bisynxu.FR/aI.YnNms/",
          "samba", "Halta", "gamba",
          "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R",
          "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb",
          "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m",
          "inter", "Locutio",
          "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/",
          "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7",
          "blah", "Sirrah", "woof",
          "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4"
        }
    );
  }

  // Test with some emails from TestUAX29URLEmailTokenizer's
  // email.addresses.from.random.text.with.email.addresses.txt
  /**
   * E-mail addresses interleaved with ordinary words: every address
   * (including one with a quoted local part and one with a bracketed
   * IP-literal domain) is expected as one token, and the surrounding words
   * as separate tokens.
   */
  public void testEmails() throws Exception {
    String textWithEmails
      = " some extra\nWords thrown in here. "
      + "dJ8ngFi@avz13m.CC\n"
      + "kU-l6DS@[082.015.228.189]\n"
      + "\"%U\u0012@?\\B\"@Fl2d.md"
      + " samba Halta gamba "
      + "Bvd#@tupjv.sn\n"
      + "SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt\n"
      + "~+Kdz@3mousnl.SE\n"
      + " inter Locutio "
      + "C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY\n"
      + "}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM"
      + " blah Sirrah woof "
      + "lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae\n"
      + "lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H\n";
    Tokenizer stream = tokenize(textWithEmails);
    assertTokenStreamContents(stream,
        new String[] {
          "some", "extra", "Words", "thrown", "in", "here",
          "dJ8ngFi@avz13m.CC",
          "kU-l6DS@[082.015.228.189]",
          "\"%U\u0012@?\\B\"@Fl2d.md",
          "samba", "Halta", "gamba",
          "Bvd#@tupjv.sn",
          "SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt",
          "~+Kdz@3mousnl.SE",
          "inter", "Locutio",
          "C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY",
          "}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM",
          "blah", "Sirrah", "woof",
          "lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae",
          "lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H"
        }
    );
  }
}