LUCENE-2763: Swap URL+Email recognizing StandardTokenizer and UAX29Tokenizer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1043071 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2010-12-07 14:53:13 +00:00
parent 5b2e0f786b
commit 2b9726ae81
19 changed files with 3560 additions and 3461 deletions

View File

@ -9,15 +9,17 @@ API Changes
* LUCENE-2413: Removed the AnalyzerUtil in common/miscellaneous. (Robert Muir) * LUCENE-2413: Removed the AnalyzerUtil in common/miscellaneous. (Robert Muir)
* LUCENE-2167: StandardTokenizer/Analyzer in common/standard/ now implement * LUCENE-2167,LUCENE-2699,LUCENE-2763: StandardTokenizer/Analyzer in
the Word Break rules from the Unicode Text Segmentation algorithm (UAX#29), common/standard/ now implement the Word Break rules from the Unicode 6.0.0
as well as tokenizing URLs and email addresses according to the relevant Text Segmentation algorithm (UAX#29).
RFCs. ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
behavior. (Steven Rowe, Robert Muir, Uwe Schindler)
* LUCENE-2699: Update StandardTokenizer and UAX29Tokenizer to Unicode 6.0.0.
(Steven Rowe)
ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
implementation and behavior.
UAX29URLEmailTokenizer tokenizes URLs and E-mail addresses according to the
relevant RFCs, in addition to implementing the UAX#29 Word Break rules.
(Steven Rowe, Robert Muir, Uwe Schindler)
* LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles * LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles
can be generated. (Chris Harris via Steven Rowe) can be generated. (Chris Harris via Steven Rowe)

View File

@ -38,7 +38,7 @@
<target name="compile-core" depends="jflex-notice, common.compile-core"/> <target name="compile-core" depends="jflex-notice, common.compile-core"/>
<target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-UAX29Tokenizer,jflex-wiki-tokenizer"/> <target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,jflex-wiki-tokenizer"/>
<target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present"> <target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex"> <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
@ -62,11 +62,11 @@
nobak="on" /> nobak="on" />
</target> </target>
<target name="jflex-UAX29Tokenizer" depends="jflex-check" if="jflex.present"> <target name="jflex-UAX29URLEmailTokenizer" depends="jflex-check" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex"> <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/> <classpath refid="jflex.classpath"/>
</taskdef> </taskdef>
<jflex file="src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex" <jflex file="src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex"
outdir="src/java/org/apache/lucene/analysis/standard" outdir="src/java/org/apache/lucene/analysis/standard"
nobak="on" /> nobak="on" />
</target> </target>

View File

@ -15,8 +15,8 @@
*/ */
// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone> // Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
// file version from Tuesday, October 12, 2010 11:34:09 AM UTC // file version from Saturday, December 4, 2010 12:34:19 PM UTC
// generated on Wednesday, October 13, 2010 4:12:27 AM UTC // generated on Sunday, December 5, 2010 12:24:12 AM UTC
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros // by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
ASCIITLD = "." ( ASCIITLD = "." (

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/3/10 9:07 AM */ /* The following code was generated by JFlex 1.5.0-SNAPSHOT on 12/4/10 7:24 PM */
package org.apache.lucene.analysis.standard; package org.apache.lucene.analysis.standard;
@ -33,8 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/** /**
* This class is a scanner generated by * This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
* on 10/3/10 9:07 AM from the specification file * on 12/4/10 7:24 PM from the specification file
* <tt>C:/Users/rmuir/workspace/lucene-clean/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt> * <tt>C:/cygwin/home/us/svn/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
*/ */
class ClassicTokenizerImpl implements StandardTokenizerInterface { class ClassicTokenizerImpl implements StandardTokenizerInterface {
@ -630,6 +630,12 @@ public final void getText(CharTermAttribute t) {
zzState = ZZ_LEXSTATE[zzLexicalState]; zzState = ZZ_LEXSTATE[zzLexicalState];
// set up zzAction for empty match case:
int zzAttributes = zzAttrL[zzState];
if ( (zzAttributes & 1) == 1 ) {
zzAction = zzState;
}
zzForAction: { zzForAction: {
while (true) { while (true) {
@ -662,7 +668,7 @@ public final void getText(CharTermAttribute t) {
if (zzNext == -1) break zzForAction; if (zzNext == -1) break zzForAction;
zzState = zzNext; zzState = zzNext;
int zzAttributes = zzAttrL[zzState]; zzAttributes = zzAttrL[zzState];
if ( (zzAttributes & 1) == 1 ) { if ( (zzAttributes & 1) == 1 ) {
zzAction = zzState; zzAction = zzState;
zzMarkedPosL = zzCurrentPosL; zzMarkedPosL = zzCurrentPosL;
@ -676,45 +682,45 @@ public final void getText(CharTermAttribute t) {
zzMarkedPos = zzMarkedPosL; zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 10:
{ return EMAIL;
}
case 11: break;
case 2:
{ return ALPHANUM;
}
case 12: break;
case 4:
{ return HOST;
}
case 13: break;
case 1:
{ /* ignore */
}
case 14: break;
case 8:
{ return ACRONYM_DEP;
}
case 15: break;
case 5: case 5:
{ return NUM; { return NUM;
} }
case 16: break; case 11: break;
case 9: case 9:
{ return ACRONYM; { return ACRONYM;
} }
case 17: break; case 12: break;
case 7: case 7:
{ return COMPANY; { return COMPANY;
} }
case 18: break; case 13: break;
case 10:
{ return EMAIL;
}
case 14: break;
case 1:
{ /* ignore */
}
case 15: break;
case 6: case 6:
{ return APOSTROPHE; { return APOSTROPHE;
} }
case 19: break; case 16: break;
case 3: case 3:
{ return CJ; { return CJ;
} }
case 17: break;
case 8:
{ return ACRONYM_DEP;
}
case 18: break;
case 2:
{ return ALPHANUM;
}
case 19: break;
case 4:
{ return HOST;
}
case 20: break; case 20: break;
default: default:
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {

View File

@ -16,6 +16,6 @@
*/ */
WARNING: if you change StandardTokenizerImpl*.jflex and need to regenerate WARNING: if you change StandardTokenizerImpl*.jflex or UAX29URLEmailTokenizer
the tokenizer, only use the trunk version of JFlex 1.5 (with a minimum and need to regenerate the tokenizer, only use the trunk version
SVN revision 597) at the moment! of JFlex 1.5 (with a minimum SVN revision 597) at the moment!

View File

@ -83,10 +83,9 @@ public final class StandardTokenizer extends Tokenizer {
@Deprecated @Deprecated
public static final int ACRONYM_DEP = 8; public static final int ACRONYM_DEP = 8;
public static final int URL = 9; public static final int SOUTHEAST_ASIAN = 9;
public static final int SOUTHEAST_ASIAN = 10; public static final int IDEOGRAPHIC = 10;
public static final int IDEOGRAPHIC = 11; public static final int HIRAGANA = 11;
public static final int HIRAGANA = 12;
/** String token types that correspond to token type int constants */ /** String token types that correspond to token type int constants */
public static final String [] TOKEN_TYPES = new String [] { public static final String [] TOKEN_TYPES = new String [] {
@ -99,7 +98,6 @@ public final class StandardTokenizer extends Tokenizer {
"<NUM>", "<NUM>",
"<CJ>", "<CJ>",
"<ACRONYM_DEP>", "<ACRONYM_DEP>",
"<URL>",
"<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
"<IDEOGRAPHIC>", "<IDEOGRAPHIC>",
"<HIRAGANA>" "<HIRAGANA>"

View File

@ -23,14 +23,11 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
* This class implements Word Break rules from the Unicode Text Segmentation * This class implements Word Break rules from the Unicode Text Segmentation
* algorithm, as specified in * algorithm, as specified in
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a> * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
* URLs and email addresses are also tokenized according to the relevant RFCs.
* <p/> * <p/>
* Tokens produced are of the following types: * Tokens produced are of the following types:
* <ul> * <ul>
* <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li> * <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
* <li>&lt;NUM&gt;: A number</li> * <li>&lt;NUM&gt;: A number</li>
* <li>&lt;URL&gt;: A URL</li>
* <li>&lt;EMAIL&gt;: An email address</li>
* <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast * <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li> * Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
* <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li> * <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
@ -67,83 +64,6 @@ MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]* ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
// URL and E-mail syntax specifications:
//
// RFC-952: DOD INTERNET HOST TABLE SPECIFICATION
// RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
// RFC-1123: Requirements for Internet Hosts - Application and Support
// RFC-1738: Uniform Resource Locators (URL)
// RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
// RFC-5234: Augmented BNF for Syntax Specifications: ABNF
// RFC-5321: Simple Mail Transfer Protocol
// RFC-5322: Internet Message Format
%include src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
DomainNameLoose = {DomainLabel} ("." {DomainLabel})*
IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
IPv4Address = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3}
IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
IPv6Address = ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
| "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
| {IPv6Hex16Bit}? "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit} ":" {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::" {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit}
| (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"
URIunreserved = [-._~A-Za-z0-9]
URIpercentEncoded = "%" [0-9A-Fa-f]{2}
URIsubDelims = [!$&'()*+,;=]
URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
URIquery = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIport = ":" [0-9]{1,5}
URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}
URIhostLoose = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose}
URIauthorityStrict = {URIhostStrict} {URIport}?
URIauthorityLoose = {URIlogin}? {URIhostLoose} {URIport}?
HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
HTTPpath = ("/" {HTTPsegment})*
HTTPscheme = [hH][tT][tT][pP][sS]? "://"
HTTPurlFull = {HTTPscheme} {URIauthorityLoose} {HTTPpath}? {URIquery}? {URIfragment}?
// {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
HTTPurlNoScheme = {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}
FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
FTPscheme = [fF][tT][pP] "://"
FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?
FILEscheme = [fF][iI][lL][eE] "://"
FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?
URL = {HTTPurl} | {FTPurl} | {FILEurl}
EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
// DFA minimization allows {IPv6Address} and {IPv4Address} to be included
// in the {EMAILbracketedHost} definition without incurring any size penalties,
// since {EMAILdomainLiteralText} recognizes all valid IP addresses.
// The IP address regexes are included in {EMAILbracketedHost} simply as a
// reminder that they are acceptable bracketed host forms.
EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"
EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
%{ %{
/** Alphanumeric sequences */ /** Alphanumeric sequences */
public static final int WORD_TYPE = StandardTokenizer.ALPHANUM; public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;
@ -151,12 +71,6 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
/** Numbers */ /** Numbers */
public static final int NUMERIC_TYPE = StandardTokenizer.NUM; public static final int NUMERIC_TYPE = StandardTokenizer.NUM;
/** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
public static final int URL_TYPE = StandardTokenizer.URL;
/** E-mail addresses */
public static final int EMAIL_TYPE = StandardTokenizer.EMAIL;
/** /**
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
* scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept * scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
@ -191,9 +105,6 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
// //
<<EOF>> { return StandardTokenizerInterface.YYEOF; } <<EOF>> { return StandardTokenizerInterface.YYEOF; }
{URL} { return URL_TYPE; }
{EMAIL} { return EMAIL_TYPE; }
// UAX#29 WB8. Numeric × Numeric // UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet) × Numeric // WB11. Numeric (MidNum | MidNumLet) × Numeric
// WB12. Numeric × (MidNum | MidNumLet) Numeric // WB12. Numeric × (MidNum | MidNumLet) Numeric

View File

@ -1,847 +0,0 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/13/10 12:12 AM */
package org.apache.lucene.analysis.standard;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* This class implements Word Break rules from the Unicode Text Segmentation
* algorithm, as specified in
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
* <p/>
* Tokens produced are of the following types:
* <ul>
* <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
* <li>&lt;NUM&gt;: A number</li>
* <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
* <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
* <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
* </ul>
* <b>WARNING</b>: Because JFlex does not support Unicode supplementary
* characters (characters above the Basic Multilingual Plane, which contains
* those up to and including U+FFFF), this scanner will not recognize them
* properly. If you need to be able to process text containing supplementary
* characters, consider using the ICU4J-backed implementation in modules/analysis/icu
* (org.apache.lucene.analysis.icu.segmentation.ICUTokenizer)
* instead of this class, since the ICU4J-backed implementation does not have
* this limitation.
*/
public final class UAX29Tokenizer extends Tokenizer {
/** This character denotes the end of file */
private static final int YYEOF = -1;
/** initial size of the lookahead buffer */
private static final int ZZ_BUFFERSIZE = 16384;
/** lexical states */
private static final int YYINITIAL = 0;
/**
* ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
* ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
* at the beginning of a line
* l is of the form l = 2*k, k a non negative integer
*/
private static final int ZZ_LEXSTATE[] = {
0, 0
};
/**
* Translates characters to character classes
*/
private static final String ZZ_CMAP_PACKED =
"\47\0\1\7\4\0\1\6\1\0\1\7\1\0\12\3\1\5\1\6"+
"\5\0\32\1\4\0\1\10\1\0\32\1\57\0\1\1\2\0\1\2"+
"\7\0\1\1\1\0\1\5\2\0\1\1\5\0\27\1\1\0\37\1"+
"\1\0\u01ca\1\4\0\14\1\16\0\5\1\7\0\1\1\1\0\1\1"+
"\21\0\160\2\5\1\1\0\2\1\2\0\4\1\1\6\7\0\1\1"+
"\1\5\3\1\1\0\1\1\1\0\24\1\1\0\123\1\1\0\213\1"+
"\1\0\7\2\236\1\11\0\46\1\2\0\1\1\7\0\47\1\1\0"+
"\1\6\7\0\55\2\1\0\1\2\1\0\2\2\1\0\2\2\1\0"+
"\1\2\10\0\33\1\5\0\4\1\1\5\13\0\4\2\10\0\2\6"+
"\2\0\13\2\5\0\53\1\25\2\12\3\1\0\1\3\1\6\1\0"+
"\2\1\1\2\143\1\1\0\1\1\10\2\1\0\6\2\2\1\2\2"+
"\1\0\4\2\2\1\12\3\3\1\2\0\1\1\17\0\1\2\1\1"+
"\1\2\36\1\33\2\2\0\131\1\13\2\1\1\16\0\12\3\41\1"+
"\11\2\2\1\2\0\1\6\1\0\1\1\5\0\26\1\4\2\1\1"+
"\11\2\1\1\3\2\1\1\5\2\22\0\31\1\3\2\244\0\4\2"+
"\66\1\3\2\1\1\22\2\1\1\7\2\12\1\2\2\2\0\12\3"+
"\1\0\7\1\1\0\7\1\1\0\3\2\1\0\10\1\2\0\2\1"+
"\2\0\26\1\1\0\7\1\1\0\1\1\3\0\4\1\2\0\1\2"+
"\1\1\7\2\2\0\2\2\2\0\3\2\1\1\10\0\1\2\4\0"+
"\2\1\1\0\3\1\2\2\2\0\12\3\2\1\17\0\3\2\1\0"+
"\6\1\4\0\2\1\2\0\26\1\1\0\7\1\1\0\2\1\1\0"+
"\2\1\1\0\2\1\2\0\1\2\1\0\5\2\4\0\2\2\2\0"+
"\3\2\3\0\1\2\7\0\4\1\1\0\1\1\7\0\12\3\2\2"+
"\3\1\1\2\13\0\3\2\1\0\11\1\1\0\3\1\1\0\26\1"+
"\1\0\7\1\1\0\2\1\1\0\5\1\2\0\1\2\1\1\10\2"+
"\1\0\3\2\1\0\3\2\2\0\1\1\17\0\2\1\2\2\2\0"+
"\12\3\21\0\3\2\1\0\10\1\2\0\2\1\2\0\26\1\1\0"+
"\7\1\1\0\2\1\1\0\5\1\2\0\1\2\1\1\7\2\2\0"+
"\2\2\2\0\3\2\10\0\2\2\4\0\2\1\1\0\3\1\2\2"+
"\2\0\12\3\1\0\1\1\20\0\1\2\1\1\1\0\6\1\3\0"+
"\3\1\1\0\4\1\3\0\2\1\1\0\1\1\1\0\2\1\3\0"+
"\2\1\3\0\3\1\3\0\14\1\4\0\5\2\3\0\3\2\1\0"+
"\4\2\2\0\1\1\6\0\1\2\16\0\12\3\21\0\3\2\1\0"+
"\10\1\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1\3\0"+
"\1\1\7\2\1\0\3\2\1\0\4\2\7\0\2\2\1\0\2\1"+
"\6\0\2\1\2\2\2\0\12\3\22\0\2\2\1\0\10\1\1\0"+
"\3\1\1\0\27\1\1\0\12\1\1\0\5\1\2\0\1\2\1\1"+
"\7\2\1\0\3\2\1\0\4\2\7\0\2\2\7\0\1\1\1\0"+
"\2\1\2\2\2\0\12\3\1\0\2\1\17\0\2\2\1\0\10\1"+
"\1\0\3\1\1\0\51\1\2\0\1\1\7\2\1\0\3\2\1\0"+
"\4\2\1\1\10\0\1\2\10\0\2\1\2\2\2\0\12\3\12\0"+
"\6\1\2\0\2\2\1\0\22\1\3\0\30\1\1\0\11\1\1\0"+
"\1\1\2\0\7\1\3\0\1\2\4\0\6\2\1\0\1\2\1\0"+
"\10\2\22\0\2\2\15\0\60\11\1\12\2\11\7\12\5\0\7\11"+
"\10\12\1\0\12\3\47\0\2\11\1\0\1\11\2\0\2\11\1\0"+
"\1\11\2\0\1\11\6\0\4\11\1\0\7\11\1\0\3\11\1\0"+
"\1\11\1\0\1\11\2\0\2\11\1\0\4\11\1\12\2\11\6\12"+
"\1\0\2\12\1\11\2\0\5\11\1\0\1\11\1\0\6\12\2\0"+
"\12\3\2\0\2\11\42\0\1\1\27\0\2\2\6\0\12\3\13\0"+
"\1\2\1\0\1\2\1\0\1\2\4\0\2\2\10\1\1\0\44\1"+
"\4\0\24\2\1\0\2\2\5\1\13\2\1\0\44\2\11\0\1\2"+
"\71\0\53\11\24\12\1\11\12\3\6\0\6\11\4\12\4\11\3\12"+
"\1\11\3\12\2\11\7\12\3\11\4\12\15\11\14\12\1\11\1\12"+
"\12\3\4\12\2\11\46\1\12\0\53\1\1\0\1\1\3\0\u0149\1"+
"\1\0\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0\51\1"+
"\1\0\4\1\2\0\41\1\1\0\4\1\2\0\7\1\1\0\1\1"+
"\1\0\4\1\2\0\17\1\1\0\71\1\1\0\4\1\2\0\103\1"+
"\2\0\3\2\40\0\20\1\20\0\125\1\14\0\u026c\1\2\0\21\1"+
"\1\0\32\1\5\0\113\1\3\0\3\1\17\0\15\1\1\0\4\1"+
"\3\2\13\0\22\1\3\2\13\0\22\1\2\2\14\0\15\1\1\0"+
"\3\1\1\0\2\2\14\0\64\11\40\12\3\0\1\11\4\0\1\11"+
"\1\12\2\0\12\3\41\0\3\2\2\0\12\3\6\0\130\1\10\0"+
"\51\1\1\2\1\1\5\0\106\1\12\0\35\1\3\0\14\2\4\0"+
"\14\2\12\0\12\3\36\11\2\0\5\11\13\0\54\11\4\0\21\12"+
"\7\11\2\12\6\0\12\3\1\11\3\0\2\11\40\0\27\1\5\2"+
"\4\0\65\11\12\12\1\0\35\12\2\0\1\2\12\3\6\0\12\3"+
"\6\0\16\11\122\0\5\2\57\1\21\2\7\1\4\0\12\3\21\0"+
"\11\2\14\0\3\2\36\1\12\2\3\0\2\1\12\3\6\0\46\1"+
"\16\2\14\0\44\1\24\2\10\0\12\3\3\0\3\1\12\3\44\1"+
"\122\0\3\2\1\0\25\2\4\1\1\2\4\1\1\2\15\0\300\1"+
"\47\2\25\0\4\2\u0116\1\2\0\6\1\2\0\46\1\2\0\6\1"+
"\2\0\10\1\1\0\1\1\1\0\1\1\1\0\1\1\1\0\37\1"+
"\2\0\65\1\1\0\7\1\1\0\1\1\3\0\3\1\1\0\7\1"+
"\3\0\4\1\2\0\6\1\4\0\15\1\5\0\3\1\1\0\7\1"+
"\17\0\4\2\10\0\2\7\12\0\1\7\2\0\1\5\2\0\5\2"+
"\20\0\2\10\3\0\1\6\17\0\1\10\13\0\5\2\5\0\6\2"+
"\1\0\1\1\15\0\1\1\20\0\15\1\63\0\41\2\21\0\1\1"+
"\4\0\1\1\2\0\12\1\1\0\1\1\3\0\5\1\6\0\1\1"+
"\1\0\1\1\1\0\1\1\1\0\4\1\1\0\13\1\2\0\4\1"+
"\5\0\5\1\4\0\1\1\21\0\51\1\u032d\0\64\1\u0716\0\57\1"+
"\1\0\57\1\1\0\205\1\6\0\4\1\3\2\16\0\46\1\12\0"+
"\66\1\11\0\1\1\17\0\1\2\27\1\11\0\7\1\1\0\7\1"+
"\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1"+
"\1\0\7\1\1\0\40\2\57\0\1\1\120\0\32\13\1\0\131\13"+
"\14\0\326\13\57\0\1\1\1\0\1\13\31\0\11\13\6\2\1\0"+
"\5\4\2\0\3\13\1\1\1\1\4\0\126\14\2\0\2\2\2\4"+
"\3\14\133\4\1\0\4\4\5\0\51\1\3\0\136\1\21\0\33\1"+
"\65\0\20\4\320\0\57\4\1\0\130\4\250\0\u19b6\13\112\0\u51cc\13"+
"\64\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\3\2\1"+
"\24\0\57\1\4\2\11\0\2\2\1\0\31\1\10\0\120\1\2\2"+
"\45\0\11\1\2\0\147\1\2\0\4\1\1\0\2\1\16\0\12\1"+
"\120\0\10\1\1\2\3\1\1\2\4\1\1\2\27\1\5\2\30\0"+
"\64\1\14\0\2\2\62\1\21\2\13\0\12\3\6\0\22\2\6\1"+
"\3\0\1\1\4\0\12\3\34\1\10\2\2\0\27\1\15\2\14\0"+
"\35\1\3\0\4\2\57\1\16\2\16\0\1\1\12\3\46\0\51\1"+
"\16\2\11\0\3\1\1\2\10\1\2\2\2\0\12\3\6\0\33\11"+
"\1\12\4\0\60\11\1\12\1\11\3\12\2\11\2\12\5\11\2\12"+
"\1\11\1\12\1\11\30\0\5\11\41\0\6\1\2\0\6\1\2\0"+
"\6\1\11\0\7\1\1\0\7\1\221\0\43\1\10\2\1\0\2\2"+
"\2\0\12\3\6\0\u2ba4\1\14\0\27\1\4\0\61\1\u2104\0\u012e\13"+
"\2\0\76\13\2\0\152\13\46\0\7\1\14\0\5\1\5\0\1\1"+
"\1\2\12\1\1\0\15\1\1\0\5\1\1\0\1\1\1\0\2\1"+
"\1\0\2\1\1\0\154\1\41\0\u016b\1\22\0\100\1\2\0\66\1"+
"\50\0\14\1\4\0\20\2\1\6\2\0\1\5\1\6\13\0\7\2"+
"\14\0\2\10\30\0\3\10\1\6\1\0\1\7\1\0\1\6\1\5"+
"\32\0\5\1\1\0\207\1\2\0\1\2\7\0\1\7\4\0\1\6"+
"\1\0\1\7\1\0\12\3\1\5\1\6\5\0\32\1\4\0\1\10"+
"\1\0\32\1\13\0\70\4\2\2\37\1\3\0\6\1\2\0\6\1"+
"\2\0\6\1\2\0\3\1\34\0\3\2\4\0";
/**
* Translates characters to character classes
*/
private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
/**
* Translates DFA states to action switch labels.
*/
private static final int [] ZZ_ACTION = zzUnpackAction();
private static final String ZZ_ACTION_PACKED_0 =
"\1\0\1\1\1\2\1\3\1\2\1\1\1\4\1\5"+
"\1\6\1\2\1\0\1\2\1\0\1\3\2\0";
/**
 * Builds the 16-entry table that translates DFA states to action switch
 * labels by decoding {@code ZZ_ACTION_PACKED_0}.
 *
 * @return the freshly unpacked action table
 */
private static int [] zzUnpackAction() {
  final int[] table = new int[16];
  // The end index returned by the decoder is not needed: only one packed chunk exists.
  zzUnpackAction(ZZ_ACTION_PACKED_0, 0, table);
  return table;
}
/**
 * Decodes run-length encoded (count, value) character pairs from
 * {@code packed} into {@code result}, starting at index {@code offset}.
 * A pair whose count is zero still writes its value once.
 *
 * @param packed the packed string, read two characters at a time
 * @param offset the first index of {@code result} to write
 * @param result the destination array
 * @return the index one past the last element written
 */
private static int zzUnpackAction(String packed, int offset, int [] result) {
  final int packedLength = packed.length();
  int out = offset;
  for (int in = 0; in < packedLength; in += 2) {
    final int count = packed.charAt(in);
    final int value = packed.charAt(in + 1);
    // Every pair emits at least one value, even when its count is zero.
    final int runLength = Math.max(count, 1);
    for (int k = 0; k < runLength; k++) {
      result[out++] = value;
    }
  }
  return out;
}
/**
* Translates a state to a row index in the transition table
*/
private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
private static final String ZZ_ROWMAP_PACKED_0 =
"\0\0\0\15\0\32\0\47\0\64\0\101\0\116\0\15"+
"\0\15\0\133\0\150\0\165\0\202\0\217\0\101\0\234";
/**
 * Builds the 16-entry table that translates a state to a row index in the
 * transition table by decoding {@code ZZ_ROWMAP_PACKED_0}.
 *
 * @return the freshly unpacked row map
 */
private static int [] zzUnpackRowMap() {
  final int[] rowMap = new int[16];
  // The end index returned by the decoder is not needed: only one packed chunk exists.
  zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, 0, rowMap);
  return rowMap;
}
/**
 * Decodes consecutive character pairs of {@code packed} into 32-bit values:
 * the first character of each pair supplies the high 16 bits and the second
 * the low 16 bits. Values are stored in {@code result} from {@code offset} on.
 *
 * @param packed the packed string, read two characters at a time
 * @param offset the first index of {@code result} to write
 * @param result the destination array
 * @return the index one past the last element written
 */
private static int zzUnpackRowMap(String packed, int offset, int [] result) {
  final int packedLength = packed.length();
  int out = offset;
  for (int in = 0; in < packedLength; in += 2) {
    final int highBits = packed.charAt(in) << 16;
    final int lowBits = packed.charAt(in + 1);
    result[out++] = highBits | lowBits;
  }
  return out;
}
/**
* The transition table of the DFA
*/
private static final int [] ZZ_TRANS = zzUnpackTrans();
private static final String ZZ_TRANS_PACKED_0 =
"\1\2\1\3\1\2\1\4\1\5\3\2\1\6\2\7"+
"\1\10\1\11\16\0\2\3\1\12\1\0\1\13\1\0"+
"\1\13\1\14\1\0\1\3\3\0\1\3\2\4\2\0"+
"\2\15\1\16\1\0\1\4\4\0\1\5\1\0\1\5"+
"\3\0\1\14\1\0\1\5\3\0\1\3\1\17\1\4"+
"\1\5\3\0\1\17\1\0\1\17\13\0\2\7\3\0"+
"\1\3\2\12\2\0\2\20\1\14\1\0\1\12\3\0"+
"\1\3\1\13\7\0\1\13\3\0\1\3\1\14\1\12"+
"\1\5\3\0\1\14\1\0\1\14\4\0\1\15\1\4"+
"\6\0\1\15\3\0\1\3\1\16\1\4\1\5\3\0"+
"\1\16\1\0\1\16\4\0\1\20\1\12\6\0\1\20"+
"\2\0";
/**
 * Builds the 169-entry DFA transition table by decoding
 * {@code ZZ_TRANS_PACKED_0}.
 *
 * @return the freshly unpacked transition table
 */
private static int [] zzUnpackTrans() {
  final int[] transitions = new int[169];
  // The end index returned by the decoder is not needed: only one packed chunk exists.
  zzUnpackTrans(ZZ_TRANS_PACKED_0, 0, transitions);
  return transitions;
}
/**
 * Decodes run-length encoded (count, value) character pairs from
 * {@code packed} into {@code result}, starting at index {@code offset}.
 * Each stored entry is the encoded character minus one, so an encoded 0
 * becomes -1. A pair whose count is zero still writes its value once.
 *
 * @param packed the packed string, read two characters at a time
 * @param offset the first index of {@code result} to write
 * @param result the destination array
 * @return the index one past the last element written
 */
private static int zzUnpackTrans(String packed, int offset, int [] result) {
  final int packedLength = packed.length();
  int out = offset;
  for (int in = 0; in < packedLength; in += 2) {
    final int count = packed.charAt(in);
    // Entries are stored shifted up by one in the packed form.
    final int value = packed.charAt(in + 1) - 1;
    final int runLength = Math.max(count, 1);
    for (int k = 0; k < runLength; k++) {
      result[out++] = value;
    }
  }
  return out;
}
/* error codes */
private static final int ZZ_UNKNOWN_ERROR = 0;
private static final int ZZ_NO_MATCH = 1;
private static final int ZZ_PUSHBACK_2BIG = 2;
/*
 * Error messages, indexed by the error codes declared above:
 * ZZ_UNKNOWN_ERROR (0), ZZ_NO_MATCH (1) and ZZ_PUSHBACK_2BIG (2).
 */
private static final String ZZ_ERROR_MSG[] = {
  "Unknown internal scanner error",
  "Error: could not match input",
  "Error: pushback value was too large"
};
/**
* ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
*/
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
private static final String ZZ_ATTRIBUTE_PACKED_0 =
"\1\0\1\11\5\1\2\11\1\1\1\0\1\1\1\0"+
"\1\1\2\0";
/**
 * Builds the 16-entry per-state attribute table by decoding
 * {@code ZZ_ATTRIBUTE_PACKED_0}.
 *
 * @return the freshly unpacked attribute table
 */
private static int [] zzUnpackAttribute() {
  final int[] attributes = new int[16];
  // The end index returned by the decoder is not needed: only one packed chunk exists.
  zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, 0, attributes);
  return attributes;
}
/**
 * Decodes run-length encoded (count, value) character pairs from
 * {@code packed} into {@code result}, starting at index {@code offset}.
 * A pair whose count is zero still writes its value once.
 *
 * @param packed the packed string, read two characters at a time
 * @param offset the first index of {@code result} to write
 * @param result the destination array
 * @return the index one past the last element written
 */
private static int zzUnpackAttribute(String packed, int offset, int [] result) {
  final int packedLength = packed.length();
  int out = offset;
  for (int in = 0; in < packedLength; in += 2) {
    final int count = packed.charAt(in);
    final int value = packed.charAt(in + 1);
    // Every pair emits at least one value, even when its count is zero.
    final int runLength = Math.max(count, 1);
    for (int k = 0; k < runLength; k++) {
      result[out++] = value;
    }
  }
  return out;
}
/** the input device */
private java.io.Reader zzReader;
/** the current state of the DFA */
private int zzState;
/** the current lexical state */
private int zzLexicalState = YYINITIAL;
/** this buffer contains the current text to be matched and is
the source of the yytext() string */
private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
/** the textposition at the last accepting state */
private int zzMarkedPos;
/** the current text position in the buffer */
private int zzCurrentPos;
/** startRead marks the beginning of the yytext() string in the buffer */
private int zzStartRead;
/** endRead marks the last character in the buffer, that has been read
from input */
private int zzEndRead;
/** number of newlines encountered up to the start of the matched text */
private int yyline;
/** the number of characters up to the start of the matched text */
private int yychar;
/**
* the number of characters from the last newline up to the start of the
* matched text
*/
private int yycolumn;
/**
* zzAtBOL == true <=> the scanner is currently at the beginning of a line
*/
private boolean zzAtBOL = true;
/** zzAtEOF == true <=> the scanner is at the EOF */
private boolean zzAtEOF;
/** denotes if the user-EOF-code has already been executed */
private boolean zzEOFDone;
/* user code: */
/** Token type string for alphanumeric sequences */
public static final String WORD_TYPE = "<ALPHANUM>";
/** Token type string for numbers */
public static final String NUMERIC_TYPE = "<NUM>";
/**
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
* scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
* together as a single token rather than broken up, because the logic
* required to break them at word boundaries is too complex for UAX#29.
* <p>
* See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
*/
public static final String SOUTH_EAST_ASIAN_TYPE = "<SOUTHEAST_ASIAN>";
/** Token type string for a single CJKV ideographic character */
public static final String IDEOGRAPHIC_TYPE = "<IDEOGRAPHIC>";
/** Token type string for a single hiragana character */
public static final String HIRAGANA_TYPE = "<HIRAGANA>";
// Attributes registered on this stream; populateAttributes() fills them in
// for every token that is emitted.
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt
= addAttribute(PositionIncrementAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
// Tokens longer than this many chars are skipped (see setMaxTokenLength).
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
// Position increment for the next emitted token: reset to 1 at the start of
// incrementToken() and bumped once for every over-long token that is skipped.
private int posIncr;
/**
 * Creates a tokenizer that reads from {@code input} and is constructed
 * with the given attribute source.
 *
 * @param source The AttributeSource to use
 * @param input The input reader
 */
public UAX29Tokenizer(AttributeSource source, Reader input) {
  super(source, input);
  // The generated scanner reads directly from the same reader.
  this.zzReader = input;
}
/**
 * Creates a tokenizer that reads from {@code input} and is constructed
 * with the given attribute factory.
 *
 * @param factory The AttributeFactory to use
 * @param input The input reader
 */
public UAX29Tokenizer(AttributeFactory factory, Reader input) {
  super(factory, input);
  // The generated scanner reads directly from the same reader.
  this.zzReader = input;
}
/**
 * Sets the maximum token length. Tokens longer than this value are skipped
 * rather than emitted.
 *
 * @param length the new max allowed token length
 */
public void setMaxTokenLength(int length) {
  maxTokenLength = length;
}
/**
 * Reports the maximum token length currently in effect. Tokens longer than
 * this value are skipped rather than emitted.
 *
 * @return the max allowed token length
 */
public int getMaxTokenLength() {
  return this.maxTokenLength;
}
/**
 * Sets both the start and the end of the offset attribute to the corrected
 * offset of the end of the current match.
 */
@Override
public final void end() {
  final int rawEnd = yychar + yylength();
  final int correctedEnd = correctOffset(rawEnd);
  offsetAtt.setOffset(correctedEnd, correctedEnd);
}
/**
 * Points this tokenizer at a new reader: first the superclass is reset,
 * then the generated scanner is reset via {@code yyreset}.
 *
 * @param reader the new input reader
 * @throws IOException if resetting the superclass fails
 */
@Override
public void reset(Reader reader) throws IOException {
  super.reset(reader);
  this.yyreset(reader);
}
/**
 * Advances to the next token: clears all attributes, resets the pending
 * position increment to one, and delegates to the generated scanning method.
 * <p/>
 * This wrapper exists because JFlex offers no way to insert code at the
 * start of the generated get-next-token method, nor to declare
 * {@code @Override} on it.
 *
 * @return the result of {@code getNextToken()}
 * @throws IOException if reading the input fails
 */
@Override
public final boolean incrementToken() throws IOException {
  clearAttributes();
  posIncr = 1;
  final boolean hasToken = getNextToken();
  return hasToken;
}
/**
 * Copies the current match into this TokenStream's attributes.
 * <p/>
 * When the match is no longer than maxTokenLength, the CharTermAttribute
 * receives the matched text, the OffsetAttribute its corrected start and end
 * offsets, the TypeAttribute the passed-in tokenType, and the
 * PositionIncrementAttribute one plus the number of immediately preceding
 * tokens that were skipped for being too long.
 * <p/>
 * When the match exceeds maxTokenLength, no attribute is written, the
 * pending position increment grows by one, and false is returned.
 *
 * @param tokenType The type of the matching token
 * @return true if a token is available (not too long); false otherwise
 */
private boolean populateAttributes(String tokenType) {
  if (yylength() > maxTokenLength) {
    // An over-long token is treated like a stopword: skip it and leave a
    // position increment gap for the next emitted token.
    ++posIncr;
    return false;
  }
  termAtt.copyBuffer(zzBuffer, zzStartRead, yylength());
  posIncrAtt.setPositionIncrement(posIncr);
  final int startOffset = correctOffset(yychar);
  final int endOffset = correctOffset(yychar + yylength());
  offsetAtt.setOffset(startOffset, endOffset);
  typeAtt.setType(tokenType);
  return true;
}
/**
 * Creates a new scanner that reads from the given reader.
 * A java.io.InputStream variant of this constructor also exists.
 *
 * @param in the java.io.Reader to read input from.
 */
public UAX29Tokenizer(java.io.Reader in) {
  super(in);
  // The scanner refills its buffer through zzReader, so it needs the reader too.
  zzReader = in;
}
/**
 * Creates a new scanner that reads from the given input stream.
 * There is also a java.io.Reader version of this constructor.
 * <p/>
 * NOTE: the stream is wrapped in an InputStreamReader created without an
 * explicit charset, so bytes are decoded with the platform default charset.
 * Use the Reader constructor when a specific encoding is required.
 *
 * @param in the java.io.InputStream to read input from.
 */
public UAX29Tokenizer(java.io.InputStream in) {
this(new java.io.InputStreamReader(in));
}
/**
 * Unpacks the compressed character translation table.
 * <p/>
 * The packed string is a sequence of (count, value) char pairs. Each pair
 * expands to <code>count</code> consecutive entries holding
 * <code>value</code>; a count of zero still writes one entry. Entries not
 * covered by any pair stay zero.
 * <p/>
 * The loop is bounded by the length of <code>packed</code> itself instead of
 * a hard-coded literal, so the method stays correct if the packed table is
 * regenerated with a different size.
 *
 * @param packed the packed character translation table; its length must be even
 * @return the unpacked character translation table (0x10000 entries)
 */
private static char [] zzUnpackCMap(String packed) {
  final char [] map = new char[0x10000];
  final int packedLength = packed.length();
  int i = 0;  /* index in packed string  */
  int j = 0;  /* index in unpacked array */
  while (i < packedLength) {
    int count = packed.charAt(i++);
    char value = packed.charAt(i++);
    do map[j++] = value; while (--count > 0);
  }
  return map;
}
/**
 * Refills the input buffer from the reader.
 * <p/>
 * Already-consumed characters in front of the current match are discarded
 * first, the buffer is doubled if the current position has reached its end,
 * and then one read is issued for the free space.
 *
 * @return <code>false</code> iff new input was read;
 *         <code>true</code> when the end of the stream was reached.
 *
 * @exception java.io.IOException if any I/O-Error occurs
 */
private boolean zzRefill() throws java.io.IOException {
  /* first: slide the live part of the buffer down to index 0 */
  final int consumed = zzStartRead;
  if (consumed > 0) {
    System.arraycopy(zzBuffer, consumed, zzBuffer, 0, zzEndRead - consumed);
    /* translate stored positions by the same amount */
    zzStartRead = 0;
    zzEndRead -= consumed;
    zzCurrentPos -= consumed;
    zzMarkedPos -= consumed;
  }

  /* second: grow the buffer when the current position has run off its end */
  if (zzCurrentPos >= zzBuffer.length) {
    char [] grown = new char[zzCurrentPos * 2];
    System.arraycopy(zzBuffer, 0, grown, 0, zzBuffer.length);
    zzBuffer = grown;
  }

  /* finally: read new input into the free tail of the buffer */
  final int numRead = zzReader.read(zzBuffer, zzEndRead, zzBuffer.length - zzEndRead);
  if (numRead < 0) {
    return true;
  }
  if (numRead > 0) {
    zzEndRead += numRead;
    return false;
  }

  // numRead == 0: unlikely but not impossible -- nothing was read although the
  // stream has not ended, so fall back to a single-character read.
  final int c = zzReader.read();
  if (c == -1) {
    return true;
  }
  zzBuffer[zzEndRead++] = (char) c;
  return false;
}
/**
 * Marks the scanner as being at end of file, invalidates the buffered input
 * and closes the underlying reader if there is one.
 *
 * @exception java.io.IOException if closing the reader fails
 */
private final void yyclose() throws java.io.IOException {
  zzAtEOF = true;            /* indicate end of file */
  zzEndRead = zzStartRead;   /* invalidate buffer    */

  if (zzReader == null) {
    return;
  }
  zzReader.close();
}
/**
 * Resets the scanner to read from a new input stream.
 * The old reader is not closed.
 *
 * All position, line/column and end-of-file state is reset, so the old
 * input stream <b>cannot</b> be reused (buffered content is lost).
 * The lexical state is set to <tt>YYINITIAL</tt>.
 *
 * If the internal scan buffer has grown beyond its initial size, it is
 * replaced by a fresh buffer of the initial size.
 *
 * @param reader   the new input stream
 */
private final void yyreset(java.io.Reader reader) {
  zzReader = reader;
  zzLexicalState = YYINITIAL;

  // end-of-file / beginning-of-line flags
  zzAtBOL = true;
  zzAtEOF = false;
  zzEOFDone = false;

  // buffer positions
  zzStartRead = 0;
  zzEndRead = 0;
  zzMarkedPos = 0;
  zzCurrentPos = 0;

  // position counters
  yycolumn = 0;
  yychar = 0;
  yyline = 0;

  if (zzBuffer.length > ZZ_BUFFERSIZE) {
    zzBuffer = new char[ZZ_BUFFERSIZE];
  }
}
/**
 * Returns the lexical state the scanner is currently in.
 *
 * @return the current lexical state
 */
private final int yystate() {
  return this.zzLexicalState;
}
/**
 * Switches the scanner to a new lexical state.
 *
 * @param newState the new lexical state
 */
private final void yybegin(int newState) {
  this.zzLexicalState = newState;
}
/**
 * Returns the text of the current match as a new String.
 *
 * @return the matched text
 */
private final String yytext() {
  final int start = zzStartRead;
  final int length = zzMarkedPos - start;
  return new String(zzBuffer, start, length);
}
/**
 * Returns the character at position <tt>pos</tt> of the matched text.
 *
 * Equivalent to yytext().charAt(pos), but reads the scan buffer directly
 * instead of building a String first.
 *
 * @param pos the position of the character to fetch.
 *            A value from 0 to yylength()-1.
 *
 * @return the character at position pos
 */
private final char yycharat(int pos) {
  final int bufferIndex = zzStartRead + pos;
  return zzBuffer[bufferIndex];
}
/**
 * Returns the number of characters in the current match.
 *
 * @return the length of the matched text region
 */
private final int yylength() {
  final int matchLength = zzMarkedPos - zzStartRead;
  return matchLength;
}
/**
 * Reports an error that occurred while scanning by throwing an Error.
 *
 * In a well-formed scanner (no or only correct usage of yypushback(int) and
 * a match-all fallback rule) this method is only reached for things that
 * "Can't Possibly Happen". If it is called, something is seriously wrong
 * (e.g. a JFlex bug producing a faulty scanner etc.).
 *
 * Usual syntax/scanner level error handling should be done in error
 * fallback rules.
 *
 * @param errorCode the code of the error message to display; a code outside
 *                  the message table selects the "unknown error" message
 */
private void zzScanError(int errorCode) {
  final boolean knownCode = errorCode >= 0 && errorCode < ZZ_ERROR_MSG.length;
  final String message = knownCode
      ? ZZ_ERROR_MSG[errorCode]
      : ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
  throw new Error(message);
}
/**
 * Pushes the given number of characters back into the input stream by
 * shortening the current match.
 *
 * They will be read again by the next call of the scanning method.
 *
 * @param number the number of characters to be read again.
 *               This number must not be greater than yylength()!
 */
private void yypushback(int number) {
  final int matchLength = yylength();
  if (number > matchLength) {
    zzScanError(ZZ_PUSHBACK_2BIG);
  }
  zzMarkedPos -= number;
}
/**
* Resumes scanning until the next regular expression is matched,
* the end of input is encountered or an I/O-Error occurs.
* <p/>
* Matches whose action does not return (ignored input, or tokens rejected by
* populateAttributes because they are too long) do not end the method; the
* outer loop simply scans for the next match.
*
* @return true if a token was found and the attributes were populated;
*         false when the end of input has been reached
* @exception java.io.IOException if any I/O-Error occurs
*/
private boolean getNextToken() throws java.io.IOException {
int zzInput;
int zzAction;
// cached fields:
int zzCurrentPosL;
int zzMarkedPosL;
int zzEndReadL = zzEndRead;
char [] zzBufferL = zzBuffer;
char [] zzCMapL = ZZ_CMAP;
int [] zzTransL = ZZ_TRANS;
int [] zzRowMapL = ZZ_ROWMAP;
int [] zzAttrL = ZZ_ATTRIBUTE;
// Outer loop: one iteration per match attempt.
while (true) {
zzMarkedPosL = zzMarkedPos;
// Advance the running character count by the length of the previous match.
yychar+= zzMarkedPosL-zzStartRead;
// -1 means "no accepting state seen yet" for this match attempt.
zzAction = -1;
// The new match starts where the previous one ended.
zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
zzState = ZZ_LEXSTATE[zzLexicalState];
zzForAction: {
// Inner loop: consume one input character per iteration and follow the
// transition table until no transition exists or input ends.
while (true) {
if (zzCurrentPosL < zzEndReadL)
zzInput = zzBufferL[zzCurrentPosL++];
else if (zzAtEOF) {
zzInput = YYEOF;
break zzForAction;
}
else {
// store back cached positions
zzCurrentPos = zzCurrentPosL;
zzMarkedPos = zzMarkedPosL;
boolean eof = zzRefill();
// get translated positions and possibly new buffer
zzCurrentPosL = zzCurrentPos;
zzMarkedPosL = zzMarkedPos;
zzBufferL = zzBuffer;
zzEndReadL = zzEndRead;
if (eof) {
zzInput = YYEOF;
break zzForAction;
}
else {
zzInput = zzBufferL[zzCurrentPosL++];
}
}
// Look up the next state for (current state, character class of the input).
int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
if (zzNext == -1) break zzForAction;
zzState = zzNext;
int zzAttributes = zzAttrL[zzState];
// Attribute bit 1: remember this state as the action to run and mark the
// current position as the end of the match so far.
if ( (zzAttributes & 1) == 1 ) {
zzAction = zzState;
zzMarkedPosL = zzCurrentPosL;
// Attribute bit 8: stop scanning right here.
if ( (zzAttributes & 8) == 8 ) break zzForAction;
}
}
}
// store back cached position
zzMarkedPos = zzMarkedPosL;
// Each action case deliberately has no break of its own: when its body does
// not return, control falls through into the following "case N: break;",
// which leaves the switch so the outer loop scans for the next match.
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 5:
{ if (populateAttributes(IDEOGRAPHIC_TYPE)) return true;
}
case 7: break;
case 1:
{ /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
}
case 8: break;
case 3:
{ if (populateAttributes(NUMERIC_TYPE)) return true;
}
case 9: break;
case 6:
{ if (populateAttributes(HIRAGANA_TYPE)) return true;
}
case 10: break;
case 4:
{ if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true;
}
case 11: break;
case 2:
{ if (populateAttributes(WORD_TYPE)) return true;
}
case 12: break;
default:
// Reached when the selected value matches no case above (including the
// "nothing matched" value -1): end of input if nothing was consumed,
// otherwise a scanner error.
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
{
return false;
}
}
else {
zzScanError(ZZ_NO_MATCH);
}
}
}
}
}

View File

@ -32,11 +32,14 @@ import org.apache.lucene.util.AttributeSource;
* This class implements Word Break rules from the Unicode Text Segmentation * This class implements Word Break rules from the Unicode Text Segmentation
* algorithm, as specified in * algorithm, as specified in
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a> * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
* URLs and email addresses are also tokenized according to the relevant RFCs.
* <p/> * <p/>
* Tokens produced are of the following types: * Tokens produced are of the following types:
* <ul> * <ul>
* <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li> * <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
* <li>&lt;NUM&gt;: A number</li> * <li>&lt;NUM&gt;: A number</li>
* <li>&lt;URL&gt;: A URL</li>
* <li>&lt;EMAIL&gt;: An email address</li>
* <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast * <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li> * Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
* <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li> * <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
@ -57,7 +60,7 @@ import org.apache.lucene.util.AttributeSource;
%final %final
%public %public
%apiprivate %apiprivate
%class UAX29Tokenizer %class UAX29URLEmailTokenizer
%extends Tokenizer %extends Tokenizer
%type boolean %type boolean
%function getNextToken %function getNextToken
@ -67,7 +70,7 @@ import org.apache.lucene.util.AttributeSource;
super(in); super(in);
%init} %init}
// WB4. X (Extend | Format)* --> X // UAX#29 WB4. X (Extend | Format)* --> X
// //
ALetterEx = \p{WB:ALetter} [\p{WB:Format}\p{WB:Extend}]* ALetterEx = \p{WB:ALetter} [\p{WB:Format}\p{WB:Extend}]*
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it // TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
@ -77,6 +80,85 @@ MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]* MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]*
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]* ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
// URL and E-mail syntax specifications:
//
// RFC-952: DOD INTERNET HOST TABLE SPECIFICATION
// RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
// RFC-1123: Requirements for Internet Hosts - Application and Support
// RFC-1738: Uniform Resource Locators (URL)
// RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
// RFC-5234: Augmented BNF for Syntax Specifications: ABNF
// RFC-5321: Simple Mail Transfer Protocol
// RFC-5322: Internet Message Format
%include src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
DomainNameLoose = {DomainLabel} ("." {DomainLabel})*
IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
IPv4Address = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3}
IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
IPv6Address = ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
| "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
| {IPv6Hex16Bit}? "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit} ":" {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::" {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit}
| (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"
URIunreserved = [-._~A-Za-z0-9]
URIpercentEncoded = "%" [0-9A-Fa-f]{2}
URIsubDelims = [!$&'()*+,;=]
URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
URIquery = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIport = ":" [0-9]{1,5}
URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}
URIhostLoose = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose}
URIauthorityStrict = {URIhostStrict} {URIport}?
URIauthorityLoose = {URIlogin}? {URIhostLoose} {URIport}?
HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
HTTPpath = ("/" {HTTPsegment})*
HTTPscheme = [hH][tT][tT][pP][sS]? "://"
HTTPurlFull = {HTTPscheme} {URIauthorityLoose} {HTTPpath}? {URIquery}? {URIfragment}?
// {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
HTTPurlNoScheme = {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}
FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
FTPscheme = [fF][tT][pP] "://"
FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?
FILEscheme = [fF][iI][lL][eE] "://"
FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?
URL = {HTTPurl} | {FTPurl} | {FILEurl}
EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
// RFC 5322 atext. The hyphen is listed first so that it is a literal: written
// as "+-/" it forms a character range that also admits ',' and '.', neither of
// which is an atext character ('.' is only legal between labels, which
// {EMAILlocalPart} already handles).
EMAILatomText = [-A-Za-z0-9!#$%&'*+/=?\^_`{|}~]
EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
// DFA minimization allows {IPv6Address} and {IPv4Address} to be included
// in the {EMAILbracketedHost} definition without incurring any size penalties,
// since {EMAILdomainLiteralText} recognizes all valid IP addresses.
// The IP address regexes are included in {EMAILbracketedHost} simply as a
// reminder that they are acceptable bracketed host forms.
EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"
EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
%{ %{
/** Alphanumeric sequences */ /** Alphanumeric sequences */
public static final String WORD_TYPE = "<ALPHANUM>"; public static final String WORD_TYPE = "<ALPHANUM>";
@ -84,6 +166,12 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]
/** Numbers */ /** Numbers */
public static final String NUMERIC_TYPE = "<NUM>"; public static final String NUMERIC_TYPE = "<NUM>";
/** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
public static final String URL_TYPE = "<URL>";
/** E-mail addresses */
public static final String EMAIL_TYPE = "<EMAIL>";
/** /**
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
* scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept * scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
@ -112,7 +200,7 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]
* @param source The AttributeSource to use * @param source The AttributeSource to use
* @param input The input reader * @param input The input reader
*/ */
public UAX29Tokenizer(AttributeSource source, Reader input) { public UAX29URLEmailTokenizer(AttributeSource source, Reader input) {
super(source, input); super(source, input);
zzReader = input; zzReader = input;
} }
@ -121,7 +209,7 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]
* @param factory The AttributeFactory to use * @param factory The AttributeFactory to use
* @param input The input reader * @param input The input reader
*/ */
public UAX29Tokenizer(AttributeFactory factory, Reader input) { public UAX29URLEmailTokenizer(AttributeFactory factory, Reader input) {
super(factory, input); super(factory, input);
zzReader = input; zzReader = input;
} }
@ -201,17 +289,19 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]
%% %%
// WB1. sot ÷ // UAX#29 WB1. sot ÷
// WB2. ÷ eot // WB2. ÷ eot
// //
<<EOF>> { return false; } <<EOF>> { return false; }
{URL} { if (populateAttributes(URL_TYPE)) return true; }
{EMAIL} {if (populateAttributes(EMAIL_TYPE)) return true; }
// WB8. Numeric × Numeric // UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet) × Numeric // WB11. Numeric (MidNum | MidNumLet) × Numeric
// WB12. Numeric × (MidNum | MidNumLet) Numeric // WB12. Numeric × (MidNum | MidNumLet) Numeric
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet // WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana) // WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
// //
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx} {ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
| {MidNumericEx} {NumericEx} | {MidNumericEx} {NumericEx}
@ -220,14 +310,14 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]
{ if (populateAttributes(NUMERIC_TYPE)) return true; } { if (populateAttributes(NUMERIC_TYPE)) return true; }
// WB5. ALetter × ALetter // UAX#29 WB5. ALetter × ALetter
// WB6. ALetter × (MidLetter | MidNumLet) ALetter // WB6. ALetter × (MidLetter | MidNumLet) ALetter
// WB7. ALetter (MidLetter | MidNumLet) × ALetter // WB7. ALetter (MidLetter | MidNumLet) × ALetter
// WB9. ALetter × Numeric // WB9. ALetter × Numeric
// WB10. Numeric × ALetter // WB10. Numeric × ALetter
// WB13. Katakana × Katakana // WB13. Katakana × Katakana
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet // WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana) // WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
// //
{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})* {ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})* | ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
@ -260,15 +350,15 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]
// //
\p{LB:Complex_Context}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; } \p{LB:Complex_Context}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }
// WB14. Any ÷ Any // UAX#29 WB14. Any ÷ Any
// //
\p{Script:Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; } \p{Script:Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
\p{Script:Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; } \p{Script:Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }
// WB3. CR × LF // UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷ // WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF) // WB3b. ÷ (Newline | CR | LF)
// WB14. Any ÷ Any // WB14. Any ÷ Any
// //
[^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ } [^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }

View File

@ -27,7 +27,10 @@
as of Lucene 3.1, implements the Word Break rules from the Unicode Text as of Lucene 3.1, implements the Word Break rules from the Unicode Text
Segmentation algorithm, as specified in Segmentation algorithm, as specified in
<a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>. <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
URLs and email addresses are also tokenized according to the relevant RFCs. Unlike <code>UAX29URLEmailTokenizer</code>, URLs and email addresses are
<b>not</b> tokenized as single tokens, but are instead split up into
tokens according to the UAX#29 word break rules.
<br/>
<code><a href="StandardAnalyzer">StandardAnalyzer</a></code> includes <code><a href="StandardAnalyzer">StandardAnalyzer</a></code> includes
<code>StandardTokenizer</code>, <code>StandardTokenizer</code>,
<code><a href="StandardFilter">StandardFilter</a></code>, <code><a href="StandardFilter">StandardFilter</a></code>,
@ -46,13 +49,11 @@
<code><a href="../../../../../../all/org/apache/lucene/analysis/LowerCaseFilter.html">LowerCaseFilter</a></code> <code><a href="../../../../../../all/org/apache/lucene/analysis/LowerCaseFilter.html">LowerCaseFilter</a></code>
and <code><a href="../../../../../../all/org/apache/lucene/analysis/StopFilter.html">StopFilter</a></code>. and <code><a href="../../../../../../all/org/apache/lucene/analysis/StopFilter.html">StopFilter</a></code>.
</li> </li>
<li><code><a href="UAX29Tokenizer.html">UAX29Tokenizer</a></code>: <li><code><a href="UAX29URLEmailTokenizer.html">UAX29URLEmailTokenizer</a></code>:
implements the Word Break rules from the Unicode Text Segmentation implements the Word Break rules from the Unicode Text Segmentation
algorithm, as specified in algorithm, as specified in
<a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>. <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
Unlike <code>StandardTokenizer</code>, URLs and email addresses are URLs and email addresses are also tokenized according to the relevant RFCs.
<b>not</b> tokenized as single tokens, but are instead split up into
tokens according to the UAX#29 word break rules.
</li> </li>
</ul> </ul>
</body> </body>

View File

@ -2,21 +2,14 @@ package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase; import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import java.io.BufferedReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader; import java.io.Reader;
import java.io.StringReader; import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List;
/** /**
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
@ -58,63 +51,6 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
} }
}; };
/** Passes through tokens with type "<URL>" and blocks all other types. */
private class URLFilter extends TokenFilter {
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
public URLFilter(TokenStream in) {
super(in);
}
@Override
public final boolean incrementToken() throws java.io.IOException {
boolean isTokenAvailable = false;
while (input.incrementToken()) {
if (typeAtt.type() == StandardTokenizer.TOKEN_TYPES[StandardTokenizer.URL]) {
isTokenAvailable = true;
break;
}
}
return isTokenAvailable;
}
}
/** Passes through tokens with type "<EMAIL>" and blocks all other types. */
private class EmailFilter extends TokenFilter {
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
public EmailFilter(TokenStream in) {
super(in);
}
@Override
public final boolean incrementToken() throws java.io.IOException {
boolean isTokenAvailable = false;
while (input.incrementToken()) {
if (typeAtt.type() == StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMAIL]) {
isTokenAvailable = true;
break;
}
}
return isTokenAvailable;
}
}
private Analyzer urlAnalyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
StandardTokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
tokenizer.setMaxTokenLength(Integer.MAX_VALUE); // Tokenize arbitrary length URLs
TokenFilter filter = new URLFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}
};
private Analyzer emailAnalyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
TokenFilter filter = new EmailFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}
};
public void testArmenian() throws Exception { public void testArmenian() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։", BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից", new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
@ -261,138 +197,6 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" }); new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
} }
public void testWikiURLs() throws Exception {
Reader reader = null;
String luceneResourcesWikiPage;
try {
reader = new InputStreamReader
(getClass().getResourceAsStream("LuceneResourcesWikiPage.html"), "UTF-8");
StringBuilder builder = new StringBuilder();
char[] buffer = new char[1024];
int numCharsRead;
while (-1 != (numCharsRead = reader.read(buffer))) {
builder.append(buffer, 0, numCharsRead);
}
luceneResourcesWikiPage = builder.toString();
} finally {
if (null != reader) {
reader.close();
}
}
assertTrue(null != luceneResourcesWikiPage
&& luceneResourcesWikiPage.length() > 0);
BufferedReader bufferedReader = null;
String[] urls;
try {
List<String> urlList = new ArrayList<String>();
bufferedReader = new BufferedReader(new InputStreamReader
(getClass().getResourceAsStream("LuceneResourcesWikiPageURLs.txt"), "UTF-8"));
String line;
while (null != (line = bufferedReader.readLine())) {
line = line.trim();
if (line.length() > 0) {
urlList.add(line);
}
}
urls = urlList.toArray(new String[urlList.size()]);
} finally {
if (null != bufferedReader) {
bufferedReader.close();
}
}
assertTrue(null != urls && urls.length > 0);
BaseTokenStreamTestCase.assertAnalyzesTo
(urlAnalyzer, luceneResourcesWikiPage, urls);
}
public void testEmails() throws Exception {
Reader reader = null;
String randomTextWithEmails;
try {
reader = new InputStreamReader
(getClass().getResourceAsStream("random.text.with.email.addresses.txt"), "UTF-8");
StringBuilder builder = new StringBuilder();
char[] buffer = new char[1024];
int numCharsRead;
while (-1 != (numCharsRead = reader.read(buffer))) {
builder.append(buffer, 0, numCharsRead);
}
randomTextWithEmails = builder.toString();
} finally {
if (null != reader) {
reader.close();
}
}
assertTrue(null != randomTextWithEmails
&& randomTextWithEmails.length() > 0);
BufferedReader bufferedReader = null;
String[] emails;
try {
List<String> emailList = new ArrayList<String>();
bufferedReader = new BufferedReader(new InputStreamReader
(getClass().getResourceAsStream("email.addresses.from.random.text.with.email.addresses.txt"), "UTF-8"));
String line;
while (null != (line = bufferedReader.readLine())) {
line = line.trim();
if (line.length() > 0) {
emailList.add(line);
}
}
emails = emailList.toArray(new String[emailList.size()]);
} finally {
if (null != bufferedReader) {
bufferedReader.close();
}
}
assertTrue(null != emails && emails.length > 0);
BaseTokenStreamTestCase.assertAnalyzesTo
(emailAnalyzer, randomTextWithEmails, emails);
}
public void testURLs() throws Exception {
Reader reader = null;
String randomTextWithURLs;
try {
reader = new InputStreamReader
(getClass().getResourceAsStream("random.text.with.urls.txt"), "UTF-8");
StringBuilder builder = new StringBuilder();
char[] buffer = new char[1024];
int numCharsRead;
while (-1 != (numCharsRead = reader.read(buffer))) {
builder.append(buffer, 0, numCharsRead);
}
randomTextWithURLs = builder.toString();
} finally {
if (null != reader) {
reader.close();
}
}
assertTrue(null != randomTextWithURLs
&& randomTextWithURLs.length() > 0);
BufferedReader bufferedReader = null;
String[] urls;
try {
List<String> urlList = new ArrayList<String>();
bufferedReader = new BufferedReader(new InputStreamReader
(getClass().getResourceAsStream("urls.from.random.text.with.urls.txt"), "UTF-8"));
String line;
while (null != (line = bufferedReader.readLine())) {
line = line.trim();
if (line.length() > 0) {
urlList.add(line);
}
}
urls = urlList.toArray(new String[urlList.size()]);
} finally {
if (null != bufferedReader) {
bufferedReader.close();
}
}
assertTrue(null != urls && urls.length > 0);
BaseTokenStreamTestCase.assertAnalyzesTo
(urlAnalyzer, randomTextWithURLs, urls);
}
public void testUnicodeWordBreaks() throws Exception { public void testUnicodeWordBreaks() throws Exception {
WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0(); WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
wordBreakTest.test(a); wordBreakTest.test(a);

View File

@ -2,14 +2,21 @@ package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.UAX29Tokenizer; import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase; import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import java.io.BufferedReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader; import java.io.Reader;
import java.io.StringReader; import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List;
/** /**
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
@ -28,7 +35,7 @@ import java.util.Arrays;
* limitations under the License. * limitations under the License.
*/ */
public class TestUAX29Tokenizer extends BaseTokenStreamTestCase { public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
public void testHugeDoc() throws IOException { public void testHugeDoc() throws IOException {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
@ -37,7 +44,7 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
sb.append(whitespace); sb.append(whitespace);
sb.append("testing 1234"); sb.append("testing 1234");
String input = sb.toString(); String input = sb.toString();
UAX29Tokenizer tokenizer = new UAX29Tokenizer(new StringReader(input)); UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(new StringReader(input));
BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" }); BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
} }
@ -46,11 +53,70 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
protected TokenStreamComponents createComponents protected TokenStreamComponents createComponents
(String fieldName, Reader reader) { (String fieldName, Reader reader) {
Tokenizer tokenizer = new UAX29Tokenizer(reader); Tokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
return new TokenStreamComponents(tokenizer); return new TokenStreamComponents(tokenizer);
} }
}; };
/** Passes through tokens with type "<URL>" and blocks all other types. */
private class URLFilter extends TokenFilter {
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
public URLFilter(TokenStream in) {
super(in);
}
@Override
public final boolean incrementToken() throws java.io.IOException {
boolean isTokenAvailable = false;
while (input.incrementToken()) {
if (typeAtt.type() == UAX29URLEmailTokenizer.URL_TYPE) {
isTokenAvailable = true;
break;
}
}
return isTokenAvailable;
}
}
/** Passes through tokens with type "<EMAIL>" and blocks all other types. */
private class EmailFilter extends TokenFilter {
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
public EmailFilter(TokenStream in) {
super(in);
}
@Override
public final boolean incrementToken() throws java.io.IOException {
boolean isTokenAvailable = false;
while (input.incrementToken()) {
if (typeAtt.type() == UAX29URLEmailTokenizer.EMAIL_TYPE) {
isTokenAvailable = true;
break;
}
}
return isTokenAvailable;
}
}
/**
 * Analyzer whose chain is a UAX29URLEmailTokenizer (with the maximum token
 * length raised to Integer.MAX_VALUE) wrapped in a URLFilter.
 */
private Analyzer urlAnalyzer = new ReusableAnalyzerBase() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    UAX29URLEmailTokenizer source = new UAX29URLEmailTokenizer(reader);
    // Lift the length cap so that arbitrarily long URLs are still tokenized.
    source.setMaxTokenLength(Integer.MAX_VALUE);
    return new TokenStreamComponents(source, new URLFilter(source));
  }
};
/**
 * Analyzer whose chain is a UAX29URLEmailTokenizer wrapped in an EmailFilter.
 */
private Analyzer emailAnalyzer = new ReusableAnalyzerBase() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    UAX29URLEmailTokenizer source = new UAX29URLEmailTokenizer(reader);
    return new TokenStreamComponents(source, new EmailFilter(source));
  }
};
public void testArmenian() throws Exception { public void testArmenian() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։", BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից", new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
@ -163,7 +229,6 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[]{"21.35"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
} }
public void testTextWithNumbersSA() throws Exception { public void testTextWithNumbersSA() throws Exception {
@ -197,6 +262,140 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" }); new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
} }
/**
 * Reads LuceneResourcesWikiPage.html as the analyzer input and the non-blank,
 * trimmed lines of LuceneResourcesWikiPageURLs.txt as the expected tokens,
 * then hands both to assertAnalyzesTo together with urlAnalyzer.
 */
public void testWikiURLs() throws Exception {
  // getResourceAsStream() returns null for a missing resource; check it up
  // front so the failure names the resource instead of surfacing as a bare
  // NullPointerException from inside InputStreamReader.
  java.io.InputStream pageStream
      = getClass().getResourceAsStream("LuceneResourcesWikiPage.html");
  assertTrue("Test resource not found: LuceneResourcesWikiPage.html",
             null != pageStream);
  String luceneResourcesWikiPage;
  try {
    Reader reader = new InputStreamReader(pageStream, "UTF-8");
    StringBuilder builder = new StringBuilder();
    char[] buffer = new char[1024];
    int numCharsRead;
    while (-1 != (numCharsRead = reader.read(buffer))) {
      builder.append(buffer, 0, numCharsRead);
    }
    luceneResourcesWikiPage = builder.toString();
  } finally {
    pageStream.close();
  }
  assertTrue(null != luceneResourcesWikiPage
             && luceneResourcesWikiPage.length() > 0);

  java.io.InputStream urlsStream
      = getClass().getResourceAsStream("LuceneResourcesWikiPageURLs.txt");
  assertTrue("Test resource not found: LuceneResourcesWikiPageURLs.txt",
             null != urlsStream);
  String[] urls;
  try {
    List<String> urlList = new ArrayList<String>();
    BufferedReader bufferedReader
        = new BufferedReader(new InputStreamReader(urlsStream, "UTF-8"));
    String line;
    while (null != (line = bufferedReader.readLine())) {
      line = line.trim();
      if (line.length() > 0) { // skip blank lines
        urlList.add(line);
      }
    }
    urls = urlList.toArray(new String[urlList.size()]);
  } finally {
    urlsStream.close();
  }
  assertTrue(null != urls && urls.length > 0);

  BaseTokenStreamTestCase.assertAnalyzesTo
      (urlAnalyzer, luceneResourcesWikiPage, urls);
}
/**
 * Reads random.text.with.email.addresses.txt as the analyzer input and the
 * non-blank, trimmed lines of
 * email.addresses.from.random.text.with.email.addresses.txt as the expected
 * tokens, then hands both to assertAnalyzesTo together with emailAnalyzer.
 */
public void testEmails() throws Exception {
  // getResourceAsStream() returns null for a missing resource; check it up
  // front so the failure names the resource instead of surfacing as a bare
  // NullPointerException from inside InputStreamReader.
  java.io.InputStream textStream
      = getClass().getResourceAsStream("random.text.with.email.addresses.txt");
  assertTrue("Test resource not found: random.text.with.email.addresses.txt",
             null != textStream);
  String randomTextWithEmails;
  try {
    Reader reader = new InputStreamReader(textStream, "UTF-8");
    StringBuilder builder = new StringBuilder();
    char[] buffer = new char[1024];
    int numCharsRead;
    while (-1 != (numCharsRead = reader.read(buffer))) {
      builder.append(buffer, 0, numCharsRead);
    }
    randomTextWithEmails = builder.toString();
  } finally {
    textStream.close();
  }
  assertTrue(null != randomTextWithEmails
             && randomTextWithEmails.length() > 0);

  java.io.InputStream emailsStream = getClass().getResourceAsStream
      ("email.addresses.from.random.text.with.email.addresses.txt");
  assertTrue("Test resource not found: "
             + "email.addresses.from.random.text.with.email.addresses.txt",
             null != emailsStream);
  String[] emails;
  try {
    List<String> emailList = new ArrayList<String>();
    BufferedReader bufferedReader
        = new BufferedReader(new InputStreamReader(emailsStream, "UTF-8"));
    String line;
    while (null != (line = bufferedReader.readLine())) {
      line = line.trim();
      if (line.length() > 0) { // skip blank lines
        emailList.add(line);
      }
    }
    emails = emailList.toArray(new String[emailList.size()]);
  } finally {
    emailsStream.close();
  }
  assertTrue(null != emails && emails.length > 0);

  BaseTokenStreamTestCase.assertAnalyzesTo
      (emailAnalyzer, randomTextWithEmails, emails);
}
/**
 * Reads random.text.with.urls.txt as the analyzer input and the non-blank,
 * trimmed lines of urls.from.random.text.with.urls.txt as the expected
 * tokens, then hands both to assertAnalyzesTo together with urlAnalyzer.
 */
public void testURLs() throws Exception {
  // getResourceAsStream() returns null for a missing resource; check it up
  // front so the failure names the resource instead of surfacing as a bare
  // NullPointerException from inside InputStreamReader.
  java.io.InputStream textStream
      = getClass().getResourceAsStream("random.text.with.urls.txt");
  assertTrue("Test resource not found: random.text.with.urls.txt",
             null != textStream);
  String randomTextWithURLs;
  try {
    Reader reader = new InputStreamReader(textStream, "UTF-8");
    StringBuilder builder = new StringBuilder();
    char[] buffer = new char[1024];
    int numCharsRead;
    while (-1 != (numCharsRead = reader.read(buffer))) {
      builder.append(buffer, 0, numCharsRead);
    }
    randomTextWithURLs = builder.toString();
  } finally {
    textStream.close();
  }
  assertTrue(null != randomTextWithURLs
             && randomTextWithURLs.length() > 0);

  java.io.InputStream urlsStream
      = getClass().getResourceAsStream("urls.from.random.text.with.urls.txt");
  assertTrue("Test resource not found: urls.from.random.text.with.urls.txt",
             null != urlsStream);
  String[] urls;
  try {
    List<String> urlList = new ArrayList<String>();
    BufferedReader bufferedReader
        = new BufferedReader(new InputStreamReader(urlsStream, "UTF-8"));
    String line;
    while (null != (line = bufferedReader.readLine())) {
      line = line.trim();
      if (line.length() > 0) { // skip blank lines
        urlList.add(line);
      }
    }
    urls = urlList.toArray(new String[urlList.size()]);
  } finally {
    urlsStream.close();
  }
  assertTrue(null != urls && urls.length > 0);

  BaseTokenStreamTestCase.assertAnalyzesTo
      (urlAnalyzer, randomTextWithURLs, urls);
}
public void testUnicodeWordBreaks() throws Exception { public void testUnicodeWordBreaks() throws Exception {
WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0(); WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
wordBreakTest.test(a); wordBreakTest.test(a);

View File

@ -123,7 +123,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
assertAnalyzesToReuse( assertAnalyzesToReuse(
analyzer, analyzer,
"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com", "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz@demo.com" }); new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });
} }
/** @deprecated (3.1) for version back compat */ /** @deprecated (3.1) for version back compat */

View File

@ -302,8 +302,10 @@ New Features
* SOLR-1336: Add SmartChinese (word segmentation for Simplified Chinese) * SOLR-1336: Add SmartChinese (word segmentation for Simplified Chinese)
tokenizer and filters to contrib/analysis-extras (rmuir) tokenizer and filters to contrib/analysis-extras (rmuir)
* SOLR-2211: Added UAX29TokenizerFactory, which implements UAX#29, a unicode algorithm * SOLR-2211,LUCENE-2763: Added UAX29URLEmailTokenizerFactory, which implements
with good results for most languages. (Tom Burton-West via rmuir) UAX#29, a unicode algorithm with good results for most languages, as well as
URL and E-mail tokenization according to the relevant RFCs.
(Tom Burton-West via rmuir)
* SOLR-2237: Added StempelPolishStemFilterFactory to contrib/analysis-extras (rmuir) * SOLR-2237: Added StempelPolishStemFilterFactory to contrib/analysis-extras (rmuir)

View File

@ -20,7 +20,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.standard.UAX29Tokenizer; import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import java.io.Reader; import java.io.Reader;
import java.util.Map; import java.util.Map;
@ -30,14 +30,14 @@ import java.util.Map;
* *
*/ */
public class UAX29TokenizerFactory extends BaseTokenizerFactory { public class UAX29URLEmailTokenizerFactory extends BaseTokenizerFactory {
@Override @Override
public void init(Map<String,String> args) { public void init(Map<String,String> args) {
super.init(args); super.init(args);
assureMatchVersion(); assureMatchVersion();
} }
public UAX29Tokenizer create(Reader input) { public UAX29URLEmailTokenizer create(Reader input) {
return new UAX29Tokenizer(input); return new UAX29URLEmailTokenizer(input);
} }
} }

View File

@ -1,81 +0,0 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
/**
 * Exercises UAX29TokenizerFactory with a handful of inputs modelled on
 * org.apache.lucene.analysis.TestUAX29Tokenizer.
 */
public class TestUAX29TokenizerFactory extends BaseTokenTestCase {

  /**
   * Builds a fresh factory, initializes it with DEFAULT_VERSION_PARAM and
   * returns the tokenizer it creates over the given text.
   */
  private Tokenizer tokenize(String text) throws Exception {
    Reader input = new StringReader(text);
    UAX29TokenizerFactory tokenizerFactory = new UAX29TokenizerFactory();
    tokenizerFactory.init(DEFAULT_VERSION_PARAM);
    return tokenizerFactory.create(input);
  }

  /** Text with a combining accent, an apostrophe and trailing punctuation. */
  public void testUAX29Tokenizer() throws Exception {
    String[] expected = {"Wha\u0301t's", "this", "thing", "do" };
    assertTokenStreamContents(tokenize("Wha\u0301t's this thing do?"), expected);
  }

  /** Arabic text with embedded Latin words, punctuation and a number. */
  public void testArabic() throws Exception {
    String[] expected = {"الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
        "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" };
    assertTokenStreamContents(
        tokenize("الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008."),
        expected);
  }

  /**
   * Chinese text.
   * NOTE(review): the expected tokens are seven empty strings, which looks
   * like characters lost to an encoding problem -- confirm the intended values.
   */
  public void testChinese() throws Exception {
    String[] expected = {"", "", "", "", "", "", ""};
    assertTokenStreamContents(tokenize("我是中国人。 "), expected);
  }

  /** Two space-separated Korean words. */
  public void testKorean() throws Exception {
    String[] expected = {"안녕하세요", "한글입니다"};
    assertTokenStreamContents(tokenize("안녕하세요 한글입니다"), expected);
  }

  /** A hyphenated phrase is expected to come back as its separate words. */
  public void testHyphen() throws Exception {
    String[] expected = {"some", "dashed", "phrase"};
    assertTokenStreamContents(tokenize("some-dashed-phrase"), expected);
  }
}

View File

@ -0,0 +1,155 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
/**
 * Exercises UAX29URLEmailTokenizerFactory with a handful of inputs modelled on
 * org.apache.lucene.analysis.TestUAX29URLEmailTokenizer
 */
public class TestUAX29URLEmailTokenizerFactory extends BaseTokenTestCase {

  /**
   * Builds a fresh factory, initializes it with DEFAULT_VERSION_PARAM and
   * returns the tokenizer it creates over the given text.
   */
  private Tokenizer tokenize(String text) throws Exception {
    Reader input = new StringReader(text);
    UAX29URLEmailTokenizerFactory tokenizerFactory = new UAX29URLEmailTokenizerFactory();
    tokenizerFactory.init(DEFAULT_VERSION_PARAM);
    return tokenizerFactory.create(input);
  }

  /** Text with a combining accent, an apostrophe and trailing punctuation. */
  public void testUAX29URLEmailTokenizer() throws Exception {
    String[] expected = {"Wha\u0301t's", "this", "thing", "do" };
    assertTokenStreamContents(tokenize("Wha\u0301t's this thing do?"), expected);
  }

  /** Arabic text with embedded Latin words, punctuation and a number. */
  public void testArabic() throws Exception {
    String[] expected = {"الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
        "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" };
    assertTokenStreamContents(
        tokenize("الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008."),
        expected);
  }

  /**
   * Chinese text.
   * NOTE(review): the expected tokens are seven empty strings, which looks
   * like characters lost to an encoding problem -- confirm the intended values.
   */
  public void testChinese() throws Exception {
    String[] expected = {"", "", "", "", "", "", ""};
    assertTokenStreamContents(tokenize("我是中国人。 "), expected);
  }

  /** Two space-separated Korean words. */
  public void testKorean() throws Exception {
    String[] expected = {"안녕하세요", "한글입니다"};
    assertTokenStreamContents(tokenize("안녕하세요 한글입니다"), expected);
  }

  /** A hyphenated phrase is expected to come back as its separate words. */
  public void testHyphen() throws Exception {
    String[] expected = {"some", "dashed", "phrase"};
    assertTokenStreamContents(tokenize("some-dashed-phrase"), expected);
  }

  // Test with some URLs from TestUAX29URLEmailTokenizer's
  // urls.from.random.text.with.urls.txt
  /** URLs interleaved with ordinary words; each URL is expected as one token. */
  public void testURLs() throws Exception {
    String textWithURLs
        = "http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram&paragraphs=50&length=200&no-ads=on\n"
        + " some extra\nWords thrown in here. "
        + "http://c5-3486.bisynxu.FR/aI.YnNms/"
        + " samba Halta gamba "
        + "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R\n"
        + "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb\n"
        + "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m"
        + " inter Locutio "
        + "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/\n"
        + "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7"
        + " blah Sirrah woof "
        + "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4\n";
    String[] expected = {
        "http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram&paragraphs=50&length=200&no-ads=on",
        "some", "extra", "Words", "thrown", "in", "here",
        "http://c5-3486.bisynxu.FR/aI.YnNms/",
        "samba", "Halta", "gamba",
        "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R",
        "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb",
        "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m",
        "inter", "Locutio",
        "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/",
        "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7",
        "blah", "Sirrah", "woof",
        "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4"
    };
    assertTokenStreamContents(tokenize(textWithURLs), expected);
  }

  // Test with some emails from TestUAX29URLEmailTokenizer's
  // email.addresses.from.random.text.with.email.addresses.txt
  /** E-mail addresses interleaved with ordinary words; each address is expected as one token. */
  public void testEmails() throws Exception {
    String textWithEmails
        = " some extra\nWords thrown in here. "
        + "dJ8ngFi@avz13m.CC\n"
        + "kU-l6DS@[082.015.228.189]\n"
        + "\"%U\u0012@?\\B\"@Fl2d.md"
        + " samba Halta gamba "
        + "Bvd#@tupjv.sn\n"
        + "SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt\n"
        + "~+Kdz@3mousnl.SE\n"
        + " inter Locutio "
        + "C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY\n"
        + "}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM"
        + " blah Sirrah woof "
        + "lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae\n"
        + "lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H\n";
    String[] expected = {
        "some", "extra", "Words", "thrown", "in", "here",
        "dJ8ngFi@avz13m.CC",
        "kU-l6DS@[082.015.228.189]",
        "\"%U\u0012@?\\B\"@Fl2d.md",
        "samba", "Halta", "gamba",
        "Bvd#@tupjv.sn",
        "SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt",
        "~+Kdz@3mousnl.SE",
        "inter", "Locutio",
        "C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY",
        "}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM",
        "blah", "Sirrah", "woof",
        "lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae",
        "lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H"
    };
    assertTokenStreamContents(tokenize(textWithEmails), expected);
  }
}