mirror of https://github.com/apache/lucene.git

LUCENE-2763: Swap URL+Email recognizing StandardTokenizer and UAX29Tokenizer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1043071 13f79535-47bb-0310-9956-ffa450edef68

parent 5b2e0f786b
commit 2b9726ae81
@@ -9,15 +9,17 @@ API Changes

 * LUCENE-2413: Removed the AnalyzerUtil in common/miscellaneous. (Robert Muir)

-* LUCENE-2167: StandardTokenizer/Analyzer in common/standard/ now implement
-  the Word Break rules from the Unicode Text Segmentation algorithm (UAX#29),
-  as well as tokenizing URLs and email addresses according to the relevant
-  RFCs. ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
-  behavior. (Steven Rowe, Robert Muir, Uwe Schindler)
-
-* LUCENE-2699: Update StandardTokenizer and UAX29Tokenizer to Unicode 6.0.0.
-  (Steven Rowe)
+* LUCENE-2167,LUCENE-2699,LUCENE-2763: StandardTokenizer/Analyzer in
+  common/standard/ now implement the Word Break rules from the Unicode 6.0.0
+  Text Segmentation algorithm (UAX#29).
+
+  ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
+  implementation and behavior.
+
+  UAX29URLEmailTokenizer tokenizes URLs and E-mail addresses according to the
+  relevant RFCs, in addition to implementing the UAX#29 Word Break rules.
+  (Steven Rowe, Robert Muir, Uwe Schindler)

 * LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles
   can be generated. (Chris Harris via Steven Rowe)
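As a quick illustration of the CHANGES entry above, here is a minimal, hypothetical sketch of driving the new tokenizer. The demo class name and input string are illustrative; the Reader-taking constructor, incrementToken(), CharTermAttribute, and TypeAttribute all appear in this commit, but the exact consumer loop is an assumption about the TokenStream API of this era, not code from the commit.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class UAX29URLEmailDemo {
  public static void main(String[] args) throws IOException {
    // Illustrative input: one e-mail address and one URL mixed with plain words.
    UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(
        new StringReader("mail foo@example.com or see http://example.com/x"));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
    while (tokenizer.incrementToken()) {   // one token per call
      System.out.println(type.type() + " " + term);
    }
    tokenizer.end();
    tokenizer.close();
  }
}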
@@ -38,7 +38,7 @@

   <target name="compile-core" depends="jflex-notice, common.compile-core"/>

-  <target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-UAX29Tokenizer,jflex-wiki-tokenizer"/>
+  <target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,jflex-wiki-tokenizer"/>

   <target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
     <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
@@ -62,11 +62,11 @@
            nobak="on" />
   </target>

-  <target name="jflex-UAX29Tokenizer" depends="jflex-check" if="jflex.present">
+  <target name="jflex-UAX29URLEmailTokenizer" depends="jflex-check" if="jflex.present">
     <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
       <classpath refid="jflex.classpath"/>
     </taskdef>
-    <jflex file="src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex"
+    <jflex file="src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex"
           outdir="src/java/org/apache/lucene/analysis/standard"
           nobak="on" />
   </target>
@@ -15,8 +15,8 @@
  */

 // Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
-// file version from Tuesday, October 12, 2010 11:34:09 AM UTC
-// generated on Wednesday, October 13, 2010 4:12:27 AM UTC
+// file version from Saturday, December 4, 2010 12:34:19 PM UTC
+// generated on Sunday, December 5, 2010 12:24:12 AM UTC
 // by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros

 ASCIITLD = "." (
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/3/10 9:07 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 12/4/10 7:24 PM */

 package org.apache.lucene.analysis.standard;

@@ -33,8 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 /**
  * This class is a scanner generated by
  * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 10/3/10 9:07 AM from the specification file
- * <tt>C:/Users/rmuir/workspace/lucene-clean/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
+ * on 12/4/10 7:24 PM from the specification file
+ * <tt>C:/cygwin/home/us/svn/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
  */
 class ClassicTokenizerImpl implements StandardTokenizerInterface {

@@ -630,6 +630,12 @@ public final void getText(CharTermAttribute t) {

       zzState = ZZ_LEXSTATE[zzLexicalState];

+      // set up zzAction for empty match case:
+      int zzAttributes = zzAttrL[zzState];
+      if ( (zzAttributes & 1) == 1 ) {
+        zzAction = zzState;
+      }
+

       zzForAction: {
         while (true) {
@@ -662,7 +668,7 @@ public final void getText(CharTermAttribute t) {
           if (zzNext == -1) break zzForAction;
           zzState = zzNext;

-          int zzAttributes = zzAttrL[zzState];
+          zzAttributes = zzAttrL[zzState];
           if ( (zzAttributes & 1) == 1 ) {
             zzAction = zzState;
             zzMarkedPosL = zzCurrentPosL;
@@ -676,45 +682,45 @@ public final void getText(CharTermAttribute t) {
       zzMarkedPos = zzMarkedPosL;

       switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
-        case 10:
-          { return EMAIL;
-          }
-        case 11: break;
-        case 2:
-          { return ALPHANUM;
-          }
-        case 12: break;
-        case 4:
-          { return HOST;
-          }
-        case 13: break;
-        case 1:
-          { /* ignore */
-          }
-        case 14: break;
-        case 8:
-          { return ACRONYM_DEP;
-          }
-        case 15: break;
         case 5:
           { return NUM;
           }
-        case 16: break;
+        case 11: break;
         case 9:
           { return ACRONYM;
           }
-        case 17: break;
+        case 12: break;
         case 7:
           { return COMPANY;
           }
-        case 18: break;
+        case 13: break;
+        case 10:
+          { return EMAIL;
+          }
+        case 14: break;
+        case 1:
+          { /* ignore */
+          }
+        case 15: break;
         case 6:
           { return APOSTROPHE;
           }
-        case 19: break;
+        case 16: break;
         case 3:
           { return CJ;
           }
+        case 17: break;
+        case 8:
+          { return ACRONYM_DEP;
+          }
+        case 18: break;
+        case 2:
+          { return ALPHANUM;
+          }
+        case 19: break;
+        case 4:
+          { return HOST;
+          }
         case 20: break;
         default:
           if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
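The hunk above hoists the accept-state check so that a start state that is itself accepting can produce an action before any character is consumed (the "empty match" case). The following is an illustrative reduction of that control flow, not the generated scanner itself; all names here are hypothetical.

final class EmptyMatchSketch {
  // Check whether the start state is accepting *before* the transition loop,
  // so a zero-width match still selects an action instead of falling through
  // to "no match". This mirrors the shape of the fix above in isolation.
  static int scan(int startState, int[] attributes, int[][] transitions, int[] input) {
    int action = -1;
    int state = startState;
    if ((attributes[state] & 1) == 1) {
      action = state;               // empty match: start state already accepts
    }
    for (int c : input) {
      int next = transitions[state][c];
      if (next == -1) break;        // dead end: keep the last accepted action
      state = next;
      if ((attributes[state] & 1) == 1) {
        action = state;             // longest match so far
      }
    }
    return action;                  // -1 only if nothing, not even empty, accepted
  }
}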
@@ -16,6 +16,6 @@
  */

-WARNING: if you change StandardTokenizerImpl*.jflex and need to regenerate
-the tokenizer, only use the trunk version of JFlex 1.5 (with a minimum
-SVN revision 597) at the moment!
+WARNING: if you change StandardTokenizerImpl*.jflex or UAX29URLEmailTokenizer
+and need to regenerate the tokenizer, only use the trunk version
+of JFlex 1.5 (with a minimum SVN revision 597) at the moment!
@@ -83,10 +83,9 @@ public final class StandardTokenizer extends Tokenizer {
   @Deprecated
   public static final int ACRONYM_DEP = 8;

-  public static final int URL = 9;
-  public static final int SOUTHEAST_ASIAN = 10;
-  public static final int IDEOGRAPHIC = 11;
-  public static final int HIRAGANA = 12;
+  public static final int SOUTHEAST_ASIAN = 9;
+  public static final int IDEOGRAPHIC = 10;
+  public static final int HIRAGANA = 11;

   /** String token types that correspond to token type int constants */
   public static final String [] TOKEN_TYPES = new String [] {
@@ -99,7 +98,6 @@ public final class StandardTokenizer extends Tokenizer {
     "<NUM>",
     "<CJ>",
     "<ACRONYM_DEP>",
-    "<URL>",
     "<SOUTHEAST_ASIAN>",
     "<IDEOGRAPHIC>",
     "<HIRAGANA>"
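Because URL = 9 is removed above and the later constants shift down by one, any code that maps raw type integers to display names should go through the updated TOKEN_TYPES table rather than hard-coded numbers. A small sketch, where the helper class is illustrative but the constants and array come from the diff above:

import org.apache.lucene.analysis.standard.StandardTokenizer;

public class TokenTypeName {
  // After this commit SOUTHEAST_ASIAN = 9, IDEOGRAPHIC = 10, HIRAGANA = 11,
  // and TOKEN_TYPES no longer contains a "<URL>" entry.
  static String name(int tokenType) {
    return StandardTokenizer.TOKEN_TYPES[tokenType];
  }

  public static void main(String[] args) {
    System.out.println(name(StandardTokenizer.HIRAGANA)); // prints "<HIRAGANA>"
  }
}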
[File diff suppressed because it is too large]
@@ -23,14 +23,11 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  * This class implements Word Break rules from the Unicode Text Segmentation
  * algorithm, as specified in
  * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
- * URLs and email addresses are also tokenized according to the relevant RFCs.
  * <p/>
  * Tokens produced are of the following types:
  * <ul>
  *   <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
  *   <li><NUM>: A number</li>
- *   <li><URL>: A URL</li>
- *   <li><EMAIL>: An email address</li>
  *   <li><SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast
  *       Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
  *   <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>
@@ -67,83 +64,6 @@ MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]
 ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
 
-
-// URL and E-mail syntax specifications:
-//
-//   RFC-952:  DOD INTERNET HOST TABLE SPECIFICATION
-//   RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
-//   RFC-1123: Requirements for Internet Hosts - Application and Support
-//   RFC-1738: Uniform Resource Locators (URL)
-//   RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
-//   RFC-5234: Augmented BNF for Syntax Specifications: ABNF
-//   RFC-5321: Simple Mail Transfer Protocol
-//   RFC-5322: Internet Message Format
-
-%include src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
-
-DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
-DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
-DomainNameLoose = {DomainLabel} ("." {DomainLabel})*
-
-IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
-IPv4Address = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3}
-IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
-IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
-IPv6Address = ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
-            | "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
-            | {IPv6Hex16Bit}? "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
-            | (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
-            | (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
-            | (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit} ":" {IPv6LeastSignificant32Bits}
-            | (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::" {IPv6LeastSignificant32Bits}
-            | (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit}
-            | (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"
-
-URIunreserved = [-._~A-Za-z0-9]
-URIpercentEncoded = "%" [0-9A-Fa-f]{2}
-URIsubDelims = [!$&'()*+,;=]
-URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
-URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
-URIquery = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
-URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
-URIport = ":" [0-9]{1,5}
-URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}
-URIhostLoose = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose}
-
-URIauthorityStrict = {URIhostStrict} {URIport}?
-URIauthorityLoose = {URIlogin}? {URIhostLoose} {URIport}?
-
-HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
-HTTPpath = ("/" {HTTPsegment})*
-HTTPscheme = [hH][tT][tT][pP][sS]? "://"
-HTTPurlFull = {HTTPscheme} {URIauthorityLoose} {HTTPpath}? {URIquery}? {URIfragment}?
-// {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
-HTTPurlNoScheme = {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
-HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}
-
-FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
-FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
-FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
-FTPscheme = [fF][tT][pP] "://"
-FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?
-
-FILEscheme = [fF][iI][lL][eE] "://"
-FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?
-
-URL = {HTTPurl} | {FTPurl} | {FILEurl}
-
-EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
-EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
-EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
-EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
-EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
-// DFA minimization allows {IPv6Address} and {IPv4Address} to be included
-// in the {EMAILbracketedHost} definition without incurring any size penalties,
-// since {EMAILdomainLiteralText} recognizes all valid IP addresses.
-// The IP address regexes are included in {EMAILbracketedHost} simply as a
-// reminder that they are acceptable bracketed host forms.
-EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"
-EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
-
 %{
   /** Alphanumeric sequences */
   public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;
@@ -151,12 +71,6 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
   /** Numbers */
   public static final int NUMERIC_TYPE = StandardTokenizer.NUM;

-  /** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
-  public static final int URL_TYPE = StandardTokenizer.URL;
-
-  /** E-mail addresses */
-  public static final int EMAIL_TYPE = StandardTokenizer.EMAIL;
-
   /**
    * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
    * scripts (Thai, Lao, Myanmar, Khmer, etc.).  Sequences of these are kept
@@ -191,9 +105,6 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
 //
 <<EOF>> { return StandardTokenizerInterface.YYEOF; }

-{URL}   { return URL_TYPE; }
-{EMAIL} { return EMAIL_TYPE; }
-
 // UAX#29 WB8.  Numeric × Numeric
 // WB11. Numeric (MidNum | MidNumLet) × Numeric
 // WB12. Numeric × (MidNum | MidNumLet) Numeric
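The {IPv4DecimalOctet} macro removed above (and re-added to the UAX29URLEmailTokenizer grammar later in this commit) encodes the 0-255 range of a dotted-quad octet as four alternatives. A hedged Java sketch of the same pattern, translated by hand for illustration (the class and test strings are hypothetical, not part of the commit):

import java.util.regex.Pattern;

public class IPv4OctetCheck {
  // Hand translation of the IPv4DecimalOctet alternatives: up to two leading
  // zeros plus one digit, two digits with optional leading zero, 100-199,
  // and 200-255 split at 250.
  private static final Pattern OCTET =
      Pattern.compile("0{0,2}[0-9]|0?[1-9][0-9]|1[0-9][0-9]|2(?:[0-4][0-9]|5[0-5])");

  public static void main(String[] args) {
    for (String s : new String[] {"0", "007", "42", "199", "255", "256", "999"}) {
      System.out.println(s + " -> " + OCTET.matcher(s).matches());
      // "256" and "999" print false; everything 0-255 prints true.
    }
  }
}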
@@ -1,847 +0,0 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/13/10 12:12 AM */
-
-package org.apache.lucene.analysis.standard;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.Reader;
-
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.util.AttributeSource;
-
-
-/**
- * This class implements Word Break rules from the Unicode Text Segmentation
- * algorithm, as specified in
- * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
- * <p/>
- * Tokens produced are of the following types:
- * <ul>
- *   <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
- *   <li><NUM>: A number</li>
- *   <li><SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast
- *       Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
- *   <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>
- *   <li><HIRAGANA>: A single hiragana character</li>
- * </ul>
- * <b>WARNING</b>: Because JFlex does not support Unicode supplementary
- * characters (characters above the Basic Multilingual Plane, which contains
- * those up to and including U+FFFF), this scanner will not recognize them
- * properly. If you need to be able to process text containing supplementary
- * characters, consider using the ICU4J-backed implementation in modules/analysis/icu
- * (org.apache.lucene.analysis.icu.segmentation.ICUTokenizer)
- * instead of this class, since the ICU4J-backed implementation does not have
- * this limitation.
- */
-
-public final class UAX29Tokenizer extends Tokenizer {
-
-  /** This character denotes the end of file */
-  private static final int YYEOF = -1;
-
-  /** initial size of the lookahead buffer */
-  private static final int ZZ_BUFFERSIZE = 16384;
-
-  /** lexical states */
-  private static final int YYINITIAL = 0;
-
-  /**
-   * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
-   * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
-   *                  at the beginning of a line
-   * l is of the form l = 2*k, k a non negative integer
-   */
-  private static final int ZZ_LEXSTATE[] = {
-     0, 0
-  };
-
-  /**
-   * Translates characters to character classes
-   */
-  private static final String ZZ_CMAP_PACKED =
-    "\47\0\1\7\4\0\1\6\1\0\1\7\1\0\12\3\1\5\1\6"+
-    "\5\0\32\1\4\0\1\10\1\0\32\1\57\0\1\1\2\0\1\2"+
-    "\7\0\1\1\1\0\1\5\2\0\1\1\5\0\27\1\1\0\37\1"+
-    "\1\0\u01ca\1\4\0\14\1\16\0\5\1\7\0\1\1\1\0\1\1"+
-    "\21\0\160\2\5\1\1\0\2\1\2\0\4\1\1\6\7\0\1\1"+
-    "\1\5\3\1\1\0\1\1\1\0\24\1\1\0\123\1\1\0\213\1"+
-    "\1\0\7\2\236\1\11\0\46\1\2\0\1\1\7\0\47\1\1\0"+
-    "\1\6\7\0\55\2\1\0\1\2\1\0\2\2\1\0\2\2\1\0"+
-    "\1\2\10\0\33\1\5\0\4\1\1\5\13\0\4\2\10\0\2\6"+
-    "\2\0\13\2\5\0\53\1\25\2\12\3\1\0\1\3\1\6\1\0"+
-    "\2\1\1\2\143\1\1\0\1\1\10\2\1\0\6\2\2\1\2\2"+
-    "\1\0\4\2\2\1\12\3\3\1\2\0\1\1\17\0\1\2\1\1"+
-    "\1\2\36\1\33\2\2\0\131\1\13\2\1\1\16\0\12\3\41\1"+
-    "\11\2\2\1\2\0\1\6\1\0\1\1\5\0\26\1\4\2\1\1"+
-    "\11\2\1\1\3\2\1\1\5\2\22\0\31\1\3\2\244\0\4\2"+
-    "\66\1\3\2\1\1\22\2\1\1\7\2\12\1\2\2\2\0\12\3"+
-    "\1\0\7\1\1\0\7\1\1\0\3\2\1\0\10\1\2\0\2\1"+
-    "\2\0\26\1\1\0\7\1\1\0\1\1\3\0\4\1\2\0\1\2"+
-    "\1\1\7\2\2\0\2\2\2\0\3\2\1\1\10\0\1\2\4\0"+
-    "\2\1\1\0\3\1\2\2\2\0\12\3\2\1\17\0\3\2\1\0"+
-    "\6\1\4\0\2\1\2\0\26\1\1\0\7\1\1\0\2\1\1\0"+
-    "\2\1\1\0\2\1\2\0\1\2\1\0\5\2\4\0\2\2\2\0"+
-    "\3\2\3\0\1\2\7\0\4\1\1\0\1\1\7\0\12\3\2\2"+
-    "\3\1\1\2\13\0\3\2\1\0\11\1\1\0\3\1\1\0\26\1"+
-    "\1\0\7\1\1\0\2\1\1\0\5\1\2\0\1\2\1\1\10\2"+
-    "\1\0\3\2\1\0\3\2\2\0\1\1\17\0\2\1\2\2\2\0"+
-    "\12\3\21\0\3\2\1\0\10\1\2\0\2\1\2\0\26\1\1\0"+
-    "\7\1\1\0\2\1\1\0\5\1\2\0\1\2\1\1\7\2\2\0"+
-    "\2\2\2\0\3\2\10\0\2\2\4\0\2\1\1\0\3\1\2\2"+
-    "\2\0\12\3\1\0\1\1\20\0\1\2\1\1\1\0\6\1\3\0"+
-    "\3\1\1\0\4\1\3\0\2\1\1\0\1\1\1\0\2\1\3\0"+
-    "\2\1\3\0\3\1\3\0\14\1\4\0\5\2\3\0\3\2\1\0"+
-    "\4\2\2\0\1\1\6\0\1\2\16\0\12\3\21\0\3\2\1\0"+
-    "\10\1\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1\3\0"+
-    "\1\1\7\2\1\0\3\2\1\0\4\2\7\0\2\2\1\0\2\1"+
-    "\6\0\2\1\2\2\2\0\12\3\22\0\2\2\1\0\10\1\1\0"+
-    "\3\1\1\0\27\1\1\0\12\1\1\0\5\1\2\0\1\2\1\1"+
-    "\7\2\1\0\3\2\1\0\4\2\7\0\2\2\7\0\1\1\1\0"+
-    "\2\1\2\2\2\0\12\3\1\0\2\1\17\0\2\2\1\0\10\1"+
-    "\1\0\3\1\1\0\51\1\2\0\1\1\7\2\1\0\3\2\1\0"+
-    "\4\2\1\1\10\0\1\2\10\0\2\1\2\2\2\0\12\3\12\0"+
-    "\6\1\2\0\2\2\1\0\22\1\3\0\30\1\1\0\11\1\1\0"+
-    "\1\1\2\0\7\1\3\0\1\2\4\0\6\2\1\0\1\2\1\0"+
-    "\10\2\22\0\2\2\15\0\60\11\1\12\2\11\7\12\5\0\7\11"+
-    "\10\12\1\0\12\3\47\0\2\11\1\0\1\11\2\0\2\11\1\0"+
-    "\1\11\2\0\1\11\6\0\4\11\1\0\7\11\1\0\3\11\1\0"+
-    "\1\11\1\0\1\11\2\0\2\11\1\0\4\11\1\12\2\11\6\12"+
-    "\1\0\2\12\1\11\2\0\5\11\1\0\1\11\1\0\6\12\2\0"+
-    "\12\3\2\0\2\11\42\0\1\1\27\0\2\2\6\0\12\3\13\0"+
-    "\1\2\1\0\1\2\1\0\1\2\4\0\2\2\10\1\1\0\44\1"+
-    "\4\0\24\2\1\0\2\2\5\1\13\2\1\0\44\2\11\0\1\2"+
-    "\71\0\53\11\24\12\1\11\12\3\6\0\6\11\4\12\4\11\3\12"+
-    "\1\11\3\12\2\11\7\12\3\11\4\12\15\11\14\12\1\11\1\12"+
-    "\12\3\4\12\2\11\46\1\12\0\53\1\1\0\1\1\3\0\u0149\1"+
-    "\1\0\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0\51\1"+
-    "\1\0\4\1\2\0\41\1\1\0\4\1\2\0\7\1\1\0\1\1"+
-    "\1\0\4\1\2\0\17\1\1\0\71\1\1\0\4\1\2\0\103\1"+
-    "\2\0\3\2\40\0\20\1\20\0\125\1\14\0\u026c\1\2\0\21\1"+
-    "\1\0\32\1\5\0\113\1\3\0\3\1\17\0\15\1\1\0\4\1"+
-    "\3\2\13\0\22\1\3\2\13\0\22\1\2\2\14\0\15\1\1\0"+
-    "\3\1\1\0\2\2\14\0\64\11\40\12\3\0\1\11\4\0\1\11"+
-    "\1\12\2\0\12\3\41\0\3\2\2\0\12\3\6\0\130\1\10\0"+
-    "\51\1\1\2\1\1\5\0\106\1\12\0\35\1\3\0\14\2\4\0"+
-    "\14\2\12\0\12\3\36\11\2\0\5\11\13\0\54\11\4\0\21\12"+
-    "\7\11\2\12\6\0\12\3\1\11\3\0\2\11\40\0\27\1\5\2"+
-    "\4\0\65\11\12\12\1\0\35\12\2\0\1\2\12\3\6\0\12\3"+
-    "\6\0\16\11\122\0\5\2\57\1\21\2\7\1\4\0\12\3\21\0"+
-    "\11\2\14\0\3\2\36\1\12\2\3\0\2\1\12\3\6\0\46\1"+
-    "\16\2\14\0\44\1\24\2\10\0\12\3\3\0\3\1\12\3\44\1"+
-    "\122\0\3\2\1\0\25\2\4\1\1\2\4\1\1\2\15\0\300\1"+
-    "\47\2\25\0\4\2\u0116\1\2\0\6\1\2\0\46\1\2\0\6\1"+
-    "\2\0\10\1\1\0\1\1\1\0\1\1\1\0\1\1\1\0\37\1"+
-    "\2\0\65\1\1\0\7\1\1\0\1\1\3\0\3\1\1\0\7\1"+
-    "\3\0\4\1\2\0\6\1\4\0\15\1\5\0\3\1\1\0\7\1"+
-    "\17\0\4\2\10\0\2\7\12\0\1\7\2\0\1\5\2\0\5\2"+
-    "\20\0\2\10\3\0\1\6\17\0\1\10\13\0\5\2\5\0\6\2"+
-    "\1\0\1\1\15\0\1\1\20\0\15\1\63\0\41\2\21\0\1\1"+
-    "\4\0\1\1\2\0\12\1\1\0\1\1\3\0\5\1\6\0\1\1"+
-    "\1\0\1\1\1\0\1\1\1\0\4\1\1\0\13\1\2\0\4\1"+
-    "\5\0\5\1\4\0\1\1\21\0\51\1\u032d\0\64\1\u0716\0\57\1"+
-    "\1\0\57\1\1\0\205\1\6\0\4\1\3\2\16\0\46\1\12\0"+
-    "\66\1\11\0\1\1\17\0\1\2\27\1\11\0\7\1\1\0\7\1"+
-    "\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1"+
-    "\1\0\7\1\1\0\40\2\57\0\1\1\120\0\32\13\1\0\131\13"+
-    "\14\0\326\13\57\0\1\1\1\0\1\13\31\0\11\13\6\2\1\0"+
-    "\5\4\2\0\3\13\1\1\1\1\4\0\126\14\2\0\2\2\2\4"+
-    "\3\14\133\4\1\0\4\4\5\0\51\1\3\0\136\1\21\0\33\1"+
-    "\65\0\20\4\320\0\57\4\1\0\130\4\250\0\u19b6\13\112\0\u51cc\13"+
-    "\64\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\3\2\1"+
-    "\24\0\57\1\4\2\11\0\2\2\1\0\31\1\10\0\120\1\2\2"+
-    "\45\0\11\1\2\0\147\1\2\0\4\1\1\0\2\1\16\0\12\1"+
-    "\120\0\10\1\1\2\3\1\1\2\4\1\1\2\27\1\5\2\30\0"+
-    "\64\1\14\0\2\2\62\1\21\2\13\0\12\3\6\0\22\2\6\1"+
-    "\3\0\1\1\4\0\12\3\34\1\10\2\2\0\27\1\15\2\14\0"+
-    "\35\1\3\0\4\2\57\1\16\2\16\0\1\1\12\3\46\0\51\1"+
-    "\16\2\11\0\3\1\1\2\10\1\2\2\2\0\12\3\6\0\33\11"+
-    "\1\12\4\0\60\11\1\12\1\11\3\12\2\11\2\12\5\11\2\12"+
-    "\1\11\1\12\1\11\30\0\5\11\41\0\6\1\2\0\6\1\2\0"+
-    "\6\1\11\0\7\1\1\0\7\1\221\0\43\1\10\2\1\0\2\2"+
-    "\2\0\12\3\6\0\u2ba4\1\14\0\27\1\4\0\61\1\u2104\0\u012e\13"+
-    "\2\0\76\13\2\0\152\13\46\0\7\1\14\0\5\1\5\0\1\1"+
-    "\1\2\12\1\1\0\15\1\1\0\5\1\1\0\1\1\1\0\2\1"+
-    "\1\0\2\1\1\0\154\1\41\0\u016b\1\22\0\100\1\2\0\66\1"+
-    "\50\0\14\1\4\0\20\2\1\6\2\0\1\5\1\6\13\0\7\2"+
-    "\14\0\2\10\30\0\3\10\1\6\1\0\1\7\1\0\1\6\1\5"+
-    "\32\0\5\1\1\0\207\1\2\0\1\2\7\0\1\7\4\0\1\6"+
-    "\1\0\1\7\1\0\12\3\1\5\1\6\5\0\32\1\4\0\1\10"+
-    "\1\0\32\1\13\0\70\4\2\2\37\1\3\0\6\1\2\0\6\1"+
-    "\2\0\6\1\2\0\3\1\34\0\3\2\4\0";
-
-  /**
-   * Translates characters to character classes
-   */
-  private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
-
-  /**
-   * Translates DFA states to action switch labels.
-   */
-  private static final int [] ZZ_ACTION = zzUnpackAction();
-
-  private static final String ZZ_ACTION_PACKED_0 =
-    "\1\0\1\1\1\2\1\3\1\2\1\1\1\4\1\5"+
-    "\1\6\1\2\1\0\1\2\1\0\1\3\2\0";
-
-  private static int [] zzUnpackAction() {
-    int [] result = new int[16];
-    int offset = 0;
-    offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
-    return result;
-  }
-
-  private static int zzUnpackAction(String packed, int offset, int [] result) {
-    int i = 0;       /* index in packed string  */
-    int j = offset;  /* index in unpacked array */
-    int l = packed.length();
-    while (i < l) {
-      int count = packed.charAt(i++);
-      int value = packed.charAt(i++);
-      do result[j++] = value; while (--count > 0);
-    }
-    return j;
-  }
-
-
-  /**
-   * Translates a state to a row index in the transition table
-   */
-  private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
-
-  private static final String ZZ_ROWMAP_PACKED_0 =
-    "\0\0\0\15\0\32\0\47\0\64\0\101\0\116\0\15"+
-    "\0\15\0\133\0\150\0\165\0\202\0\217\0\101\0\234";
-
-  private static int [] zzUnpackRowMap() {
-    int [] result = new int[16];
-    int offset = 0;
-    offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
-    return result;
-  }
-
-  private static int zzUnpackRowMap(String packed, int offset, int [] result) {
-    int i = 0;       /* index in packed string  */
-    int j = offset;  /* index in unpacked array */
-    int l = packed.length();
-    while (i < l) {
-      int high = packed.charAt(i++) << 16;
-      result[j++] = high | packed.charAt(i++);
-    }
-    return j;
-  }
-
-  /**
-   * The transition table of the DFA
-   */
-  private static final int [] ZZ_TRANS = zzUnpackTrans();
-
-  private static final String ZZ_TRANS_PACKED_0 =
-    "\1\2\1\3\1\2\1\4\1\5\3\2\1\6\2\7"+
-    "\1\10\1\11\16\0\2\3\1\12\1\0\1\13\1\0"+
-    "\1\13\1\14\1\0\1\3\3\0\1\3\2\4\2\0"+
-    "\2\15\1\16\1\0\1\4\4\0\1\5\1\0\1\5"+
-    "\3\0\1\14\1\0\1\5\3\0\1\3\1\17\1\4"+
-    "\1\5\3\0\1\17\1\0\1\17\13\0\2\7\3\0"+
-    "\1\3\2\12\2\0\2\20\1\14\1\0\1\12\3\0"+
-    "\1\3\1\13\7\0\1\13\3\0\1\3\1\14\1\12"+
-    "\1\5\3\0\1\14\1\0\1\14\4\0\1\15\1\4"+
-    "\6\0\1\15\3\0\1\3\1\16\1\4\1\5\3\0"+
-    "\1\16\1\0\1\16\4\0\1\20\1\12\6\0\1\20"+
-    "\2\0";
-
-  private static int [] zzUnpackTrans() {
-    int [] result = new int[169];
-    int offset = 0;
-    offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
-    return result;
-  }
-
-  private static int zzUnpackTrans(String packed, int offset, int [] result) {
-    int i = 0;       /* index in packed string  */
-    int j = offset;  /* index in unpacked array */
-    int l = packed.length();
-    while (i < l) {
-      int count = packed.charAt(i++);
-      int value = packed.charAt(i++);
-      value--;
-      do result[j++] = value; while (--count > 0);
-    }
-    return j;
-  }
-
-
-  /* error codes */
-  private static final int ZZ_UNKNOWN_ERROR = 0;
-  private static final int ZZ_NO_MATCH = 1;
-  private static final int ZZ_PUSHBACK_2BIG = 2;
-
-  /* error messages for the codes above */
-  private static final String ZZ_ERROR_MSG[] = {
-    "Unkown internal scanner error",
-    "Error: could not match input",
-    "Error: pushback value was too large"
-  };
-
-  /**
-   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
-   */
-  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
-
-  private static final String ZZ_ATTRIBUTE_PACKED_0 =
-    "\1\0\1\11\5\1\2\11\1\1\1\0\1\1\1\0"+
-    "\1\1\2\0";
-
-  private static int [] zzUnpackAttribute() {
-    int [] result = new int[16];
-    int offset = 0;
-    offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
-    return result;
-  }
-
-  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
-    int i = 0;       /* index in packed string  */
-    int j = offset;  /* index in unpacked array */
-    int l = packed.length();
-    while (i < l) {
-      int count = packed.charAt(i++);
-      int value = packed.charAt(i++);
-      do result[j++] = value; while (--count > 0);
-    }
-    return j;
-  }
-
-  /** the input device */
-  private java.io.Reader zzReader;
-
-  /** the current state of the DFA */
-  private int zzState;
-
-  /** the current lexical state */
-  private int zzLexicalState = YYINITIAL;
-
-  /** this buffer contains the current text to be matched and is
-      the source of the yytext() string */
-  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
-
-  /** the textposition at the last accepting state */
-  private int zzMarkedPos;
-
-  /** the current text position in the buffer */
-  private int zzCurrentPos;
-
-  /** startRead marks the beginning of the yytext() string in the buffer */
-  private int zzStartRead;
-
-  /** endRead marks the last character in the buffer, that has been read
-      from input */
-  private int zzEndRead;
-
-  /** number of newlines encountered up to the start of the matched text */
-  private int yyline;
-
-  /** the number of characters up to the start of the matched text */
-  private int yychar;
-
-  /**
-   * the number of characters from the last newline up to the start of the
-   * matched text
-   */
-  private int yycolumn;
-
-  /**
-   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
-   */
-  private boolean zzAtBOL = true;
-
-  /** zzAtEOF == true <=> the scanner is at the EOF */
-  private boolean zzAtEOF;
-
-  /** denotes if the user-EOF-code has already been executed */
-  private boolean zzEOFDone;
-
-  /* user code: */
-  /** Alphanumeric sequences */
-  public static final String WORD_TYPE = "<ALPHANUM>";
-
-  /** Numbers */
-  public static final String NUMERIC_TYPE = "<NUM>";
-
-  /**
-   * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
-   * scripts (Thai, Lao, Myanmar, Khmer, etc.).  Sequences of these are kept
-   * together as as a single token rather than broken up, because the logic
-   * required to break them at word boundaries is too complex for UAX#29.
-   * <p>
-   * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
-   */
-  public static final String SOUTH_EAST_ASIAN_TYPE = "<SOUTHEAST_ASIAN>";
-
-  public static final String IDEOGRAPHIC_TYPE = "<IDEOGRAPHIC>";
-
-  public static final String HIRAGANA_TYPE = "<HIRAGANA>";
-
-  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-  private final PositionIncrementAttribute posIncrAtt
-    = addAttribute(PositionIncrementAttribute.class);
-  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-
-  private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
-  private int posIncr;
-
-
-  /**
-   * @param source The AttributeSource to use
-   * @param input The input reader
-   */
-  public UAX29Tokenizer(AttributeSource source, Reader input) {
-    super(source, input);
-    zzReader = input;
-  }
-
-  /**
-   * @param factory The AttributeFactory to use
-   * @param input The input reader
-   */
-  public UAX29Tokenizer(AttributeFactory factory, Reader input) {
-    super(factory, input);
-    zzReader = input;
-  }
-
-  /**
-   * Set the max allowed token length.  Any token longer than this is skipped.
-   * @param length the new max allowed token length
-   */
-  public void setMaxTokenLength(int length) {
-    this.maxTokenLength = length;
-  }
-
-  /**
-   * Returns the max allowed token length.  Any token longer than this is
-   * skipped.
-   * @return the max allowed token length
-   */
-  public int getMaxTokenLength() {
-    return maxTokenLength;
-  }
-
-  @Override
-  public final void end() {
-    // set final offset
-    int finalOffset = correctOffset(yychar + yylength());
-    offsetAtt.setOffset(finalOffset, finalOffset);
-  }
-
-  @Override
-  public void reset(Reader reader) throws IOException {
-    super.reset(reader);
-    yyreset(reader);
-  }
-
-  @Override
-  public final boolean incrementToken() throws IOException {
-    // This method is required because of two JFlex limitations:
-    // 1. No way to insert code at the beginning of the generated scanning
-    //    get-next-token method; and
-    // 2. No way to declare @Override on the generated scanning method.
-    clearAttributes();
-    posIncr = 1;
-    return getNextToken();
-  }
-
-  /**
-   * Populates this TokenStream's CharTermAttribute and OffsetAttribute from
-   * the current match, the TypeAttribute from the passed-in tokenType, and
-   * the PositionIncrementAttribute to one, unless the immediately previous
-   * token(s) was/were skipped because maxTokenLength was exceeded, in which
-   * case the PositionIncrementAttribute is set to one plus the number of
-   * skipped overly long tokens.
-   * <p/>
-   * If maxTokenLength is exceeded, the CharTermAttribute is set back to empty
-   * and false is returned.
-   *
-   * @param tokenType The type of the matching token
-   * @return true there is a token available (not too long); false otherwise
-   */
-  private boolean populateAttributes(String tokenType) {
-    boolean isTokenAvailable = false;
-    if (yylength() > maxTokenLength) {
-      // When we skip a too-long token, we treat it like a stopword, introducing
-      // a position increment gap
-      ++posIncr;
-    } else {
-      termAtt.copyBuffer(zzBuffer, zzStartRead, yylength());
-      posIncrAtt.setPositionIncrement(posIncr);
-      offsetAtt.setOffset(correctOffset(yychar),
-                          correctOffset(yychar + yylength()));
-      typeAtt.setType(tokenType);
-      isTokenAvailable = true;
-    }
-    return isTokenAvailable;
-  }
-
-
-  /**
-   * Creates a new scanner
-   * There is also a java.io.InputStream version of this constructor.
-   *
-   * @param   in  the java.io.Reader to read input from.
-   */
-  public UAX29Tokenizer(java.io.Reader in) {
-    super(in);
-    this.zzReader = in;
-  }
-
-  /**
-   * Creates a new scanner.
-   * There is also java.io.Reader version of this constructor.
-   *
-   * @param   in  the java.io.Inputstream to read input from.
-   */
-  public UAX29Tokenizer(java.io.InputStream in) {
-    this(new java.io.InputStreamReader(in));
-  }
-
-  /**
-   * Unpacks the compressed character translation table.
-   *
-   * @param packed   the packed character translation table
-   * @return         the unpacked character translation table
-   */
-  private static char [] zzUnpackCMap(String packed) {
-    char [] map = new char[0x10000];
-    int i = 0;  /* index in packed string  */
-    int j = 0;  /* index in unpacked array */
-    while (i < 2174) {
-      int  count = packed.charAt(i++);
-      char value = packed.charAt(i++);
-      do map[j++] = value; while (--count > 0);
-    }
-    return map;
-  }
-
-
-  /**
-   * Refills the input buffer.
-   *
-   * @return      <code>false</code>, iff there was new input.
-   *
-   * @exception   java.io.IOException  if any I/O-Error occurs
-   */
-  private boolean zzRefill() throws java.io.IOException {
-
-    /* first: make room (if you can) */
-    if (zzStartRead > 0) {
-      System.arraycopy(zzBuffer, zzStartRead,
-                       zzBuffer, 0,
-                       zzEndRead-zzStartRead);
-
-      /* translate stored positions */
-      zzEndRead-= zzStartRead;
-      zzCurrentPos-= zzStartRead;
-      zzMarkedPos-= zzStartRead;
-      zzStartRead = 0;
-    }
-
-    /* is the buffer big enough? */
-    if (zzCurrentPos >= zzBuffer.length) {
-      /* if not: blow it up */
-      char newBuffer[] = new char[zzCurrentPos*2];
-      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
-      zzBuffer = newBuffer;
-    }
-
-    /* finally: fill the buffer with new input */
-    int numRead = zzReader.read(zzBuffer, zzEndRead,
-                                zzBuffer.length-zzEndRead);
-
-    if (numRead > 0) {
-      zzEndRead+= numRead;
-      return false;
-    }
-    // unlikely but not impossible: read 0 characters, but not at end of stream
-    if (numRead == 0) {
-      int c = zzReader.read();
-      if (c == -1) {
-        return true;
-      } else {
-        zzBuffer[zzEndRead++] = (char) c;
-        return false;
-      }
-    }
-
-    // numRead < 0
-    return true;
-  }
-
-
-  /**
-   * Closes the input stream.
-   */
-  private final void yyclose() throws java.io.IOException {
-    zzAtEOF = true;            /* indicate end of file */
-    zzEndRead = zzStartRead;   /* invalidate buffer    */
-
-    if (zzReader != null)
-      zzReader.close();
-  }
-
-
-  /**
-   * Resets the scanner to read from a new input stream.
-   * Does not close the old reader.
-   *
-   * All internal variables are reset, the old input stream
-   * <b>cannot</b> be reused (internal buffer is discarded and lost).
-   * Lexical state is set to <tt>ZZ_INITIAL</tt>.
-   *
-   * Internal scan buffer is resized down to its initial length, if it has grown.
-   *
-   * @param reader   the new input stream
-   */
-  private final void yyreset(java.io.Reader reader) {
-    zzReader = reader;
-    zzAtBOL  = true;
-    zzAtEOF  = false;
-    zzEOFDone = false;
-    zzEndRead = zzStartRead = 0;
-    zzCurrentPos = zzMarkedPos = 0;
-    yyline = yychar = yycolumn = 0;
-    zzLexicalState = YYINITIAL;
-    if (zzBuffer.length > ZZ_BUFFERSIZE)
-      zzBuffer = new char[ZZ_BUFFERSIZE];
-  }
-
-
-  /**
-   * Returns the current lexical state.
-   */
-  private final int yystate() {
-    return zzLexicalState;
-  }
-
-
-  /**
-   * Enters a new lexical state
-   *
-   * @param newState the new lexical state
-   */
-  private final void yybegin(int newState) {
-    zzLexicalState = newState;
-  }
-
-
-  /**
-   * Returns the text matched by the current regular expression.
-   */
-  private final String yytext() {
-    return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
-  }
-
-
-  /**
-   * Returns the character at position <tt>pos</tt> from the
-   * matched text.
-   *
-   * It is equivalent to yytext().charAt(pos), but faster
-   *
-   * @param pos the position of the character to fetch.
-   *            A value from 0 to yylength()-1.
-   *
-   * @return the character at position pos
-   */
-  private final char yycharat(int pos) {
-    return zzBuffer[zzStartRead+pos];
-  }
-
-
-  /**
-   * Returns the length of the matched text region.
-   */
-  private final int yylength() {
-    return zzMarkedPos-zzStartRead;
-  }
-
-
-  /**
-   * Reports an error that occured while scanning.
-   *
-   * In a wellformed scanner (no or only correct usage of
-   * yypushback(int) and a match-all fallback rule) this method
-   * will only be called with things that "Can't Possibly Happen".
-   * If this method is called, something is seriously wrong
-   * (e.g. a JFlex bug producing a faulty scanner etc.).
-   *
-   * Usual syntax/scanner level error handling should be done
-   * in error fallback rules.
-   *
-   * @param   errorCode  the code of the errormessage to display
-   */
-  private void zzScanError(int errorCode) {
-    String message;
-    try {
-      message = ZZ_ERROR_MSG[errorCode];
-    }
-    catch (ArrayIndexOutOfBoundsException e) {
-      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
-    }
-
-    throw new Error(message);
-  }
-
-
-  /**
-   * Pushes the specified amount of characters back into the input stream.
-   *
-   * They will be read again by then next call of the scanning method
-   *
-   * @param number  the number of characters to be read again.
-   *                This number must not be greater than yylength()!
-   */
-  private void yypushback(int number)  {
-    if ( number > yylength() )
-      zzScanError(ZZ_PUSHBACK_2BIG);
-
-    zzMarkedPos -= number;
-  }
-
-
-  /**
-   * Resumes scanning until the next regular expression is matched,
-   * the end of input is encountered or an I/O-Error occurs.
-   *
-   * @return      the next token
-   * @exception   java.io.IOException  if any I/O-Error occurs
-   */
-  private boolean getNextToken() throws java.io.IOException {
-    int zzInput;
-    int zzAction;
-
-    // cached fields:
-    int zzCurrentPosL;
-    int zzMarkedPosL;
-    int zzEndReadL = zzEndRead;
-    char [] zzBufferL = zzBuffer;
-    char [] zzCMapL = ZZ_CMAP;
-
-    int [] zzTransL = ZZ_TRANS;
-    int [] zzRowMapL = ZZ_ROWMAP;
-    int [] zzAttrL = ZZ_ATTRIBUTE;
-
-    while (true) {
-      zzMarkedPosL = zzMarkedPos;
-
-      yychar+= zzMarkedPosL-zzStartRead;
-
-      zzAction = -1;
-
-      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
-
-      zzState = ZZ_LEXSTATE[zzLexicalState];
-
-
-      zzForAction: {
-        while (true) {
-
-          if (zzCurrentPosL < zzEndReadL)
-            zzInput = zzBufferL[zzCurrentPosL++];
-          else if (zzAtEOF) {
-            zzInput = YYEOF;
-            break zzForAction;
-          }
-          else {
-            // store back cached positions
-            zzCurrentPos = zzCurrentPosL;
-            zzMarkedPos = zzMarkedPosL;
-            boolean eof = zzRefill();
-            // get translated positions and possibly new buffer
-            zzCurrentPosL = zzCurrentPos;
-            zzMarkedPosL = zzMarkedPos;
-            zzBufferL = zzBuffer;
-            zzEndReadL = zzEndRead;
-            if (eof) {
-              zzInput = YYEOF;
-              break zzForAction;
-            }
-            else {
-              zzInput = zzBufferL[zzCurrentPosL++];
-            }
-          }
-          int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
-          if (zzNext == -1) break zzForAction;
-          zzState = zzNext;
-
-          int zzAttributes = zzAttrL[zzState];
-          if ( (zzAttributes & 1) == 1 ) {
-            zzAction = zzState;
-            zzMarkedPosL = zzCurrentPosL;
-            if ( (zzAttributes & 8) == 8 ) break zzForAction;
-          }
-
-        }
-      }
-
-      // store back cached position
-      zzMarkedPos = zzMarkedPosL;
-
-      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
-        case 5:
-          { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true;
-          }
-        case 7: break;
-        case 1:
-          { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
-          }
-        case 8: break;
-        case 3:
-          { if (populateAttributes(NUMERIC_TYPE)) return true;
-          }
-        case 9: break;
-        case 6:
-          { if (populateAttributes(HIRAGANA_TYPE)) return true;
-          }
-        case 10: break;
-        case 4:
-          { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true;
-          }
-        case 11: break;
-        case 2:
-          { if (populateAttributes(WORD_TYPE)) return true;
-          }
-        case 12: break;
-        default:
-          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
-            zzAtEOF = true;
-            {
-              return false;
-            }
-          }
-          else {
-            zzScanError(ZZ_NO_MATCH);
-          }
-      }
-    }
-  }
-
-
-}
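The populateAttributes() javadoc in the deleted file above describes how overlong tokens are skipped like stopwords, leaving a position increment gap. A hedged sketch of exercising that behavior through the public knobs of the renamed tokenizer; the demo class, input string, and limit value are illustrative, while setMaxTokenLength() and PositionIncrementAttribute come from the source:

import java.io.StringReader;

import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class MaxTokenLengthDemo {
  public static void main(String[] args) throws Exception {
    UAX29URLEmailTokenizer tokenizer =
        new UAX29URLEmailTokenizer(new StringReader("one extraordinarily long token two"));
    tokenizer.setMaxTokenLength(10); // anything longer is skipped like a stopword
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr =
        tokenizer.addAttribute(PositionIncrementAttribute.class);
    while (tokenizer.incrementToken()) {
      // "extraordinarily" (15 chars) is skipped, so "long" should arrive
      // with a position increment of 2 rather than 1.
      System.out.println(term + " +" + posIncr.getPositionIncrement());
    }
    tokenizer.close();
  }
}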
[File diff suppressed because it is too large]
@ -32,11 +32,14 @@ import org.apache.lucene.util.AttributeSource;
|
||||||
* This class implements Word Break rules from the Unicode Text Segmentation
|
* This class implements Word Break rules from the Unicode Text Segmentation
|
||||||
* algorithm, as specified in
|
* algorithm, as specified in
|
||||||
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
|
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
|
||||||
|
* URLs and email addresses are also tokenized according to the relevant RFCs.
|
||||||
* <p/>
|
* <p/>
|
||||||
* Tokens produced are of the following types:
|
* Tokens produced are of the following types:
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
|
* <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
|
||||||
* <li><NUM>: A number</li>
|
* <li><NUM>: A number</li>
|
||||||
|
* <li><URL>: A URL</li>
|
||||||
|
* <li><EMAIL>: An email address</li>
|
||||||
* <li><SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast
|
* <li><SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast
|
||||||
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
|
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
|
||||||
* <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>
|
* <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>
|
||||||
|
@ -57,7 +60,7 @@ import org.apache.lucene.util.AttributeSource;
|
||||||
%final
|
%final
|
||||||
%public
|
%public
|
||||||
%apiprivate
|
%apiprivate
|
||||||
%class UAX29Tokenizer
|
%class UAX29URLEmailTokenizer
|
||||||
%extends Tokenizer
|
%extends Tokenizer
|
||||||
%type boolean
|
%type boolean
|
||||||
%function getNextToken
|
%function getNextToken
|
||||||
|
@ -67,7 +70,7 @@ import org.apache.lucene.util.AttributeSource;
|
||||||
super(in);
|
super(in);
|
||||||
%init}
|
%init}
|
||||||
|
|
||||||
// WB4. X (Extend | Format)* --> X
|
// UAX#29 WB4. X (Extend | Format)* --> X
|
||||||
//
|
//
|
||||||
ALetterEx = \p{WB:ALetter} [\p{WB:Format}\p{WB:Extend}]*
|
ALetterEx = \p{WB:ALetter} [\p{WB:Format}\p{WB:Extend}]*
|
||||||
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
|
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
|
||||||
|
@ -77,6 +80,85 @@ MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]
|
||||||
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]*
|
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]*
|
||||||
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
|
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
|
||||||
|
|
||||||
|
|
||||||
|
// URL and E-mail syntax specifications:
|
||||||
|
//
|
||||||
|
// RFC-952: DOD INTERNET HOST TABLE SPECIFICATION
|
||||||
|
// RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
|
||||||
|
// RFC-1123: Requirements for Internet Hosts - Application and Support
|
||||||
|
// RFC-1738: Uniform Resource Locators (URL)
|
||||||
|
// RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
|
||||||
|
// RFC-5234: Augmented BNF for Syntax Specifications: ABNF
|
||||||
|
// RFC-5321: Simple Mail Transfer Protocol
|
||||||
|
// RFC-5322: Internet Message Format
|
||||||
|
|
||||||
|
%include src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
|
||||||
|
|
||||||
|
DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
|
||||||
|
DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
|
||||||
|
DomainNameLoose = {DomainLabel} ("." {DomainLabel})*
|
||||||
|
|
||||||
|
IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
|
||||||
|
IPv4Address = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3}
|
||||||
|
IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
|
||||||
|
IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
|
||||||
|
IPv6Address = ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
|
||||||
|
| "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
|
||||||
|
| {IPv6Hex16Bit}? "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
|
||||||
|
| (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
|
||||||
|
| (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
|
||||||
|
| (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit} ":" {IPv6LeastSignificant32Bits}
|
||||||
|
| (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::" {IPv6LeastSignificant32Bits}
|
||||||
|
| (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit}
|
||||||
|
| (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"
|
||||||
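As a sanity check on the {IPv4DecimalOctet} alternation above, a stand-alone sketch (my translation into java.util.regex; the class name and test inputs are invented):

    import java.util.regex.Pattern;

    public class IPv4OctetCheck {
      public static void main(String[] args) {
        // Same four alternatives as {IPv4DecimalOctet}: 0-9 with up to two
        // leading zeros, 10-99 with one optional zero, 100-199, and 200-255.
        Pattern octet = Pattern.compile(
            "0{0,2}[0-9]|0?[1-9][0-9]|1[0-9][0-9]|2(?:[0-4][0-9]|5[0-5])");
        for (String s : new String[] {"0", "007", "45", "100", "255", "256", "999"}) {
          System.out.println(s + " -> " + octet.matcher(s).matches());
        }
        // Prints true for everything up to 255; "256" and "999" print false.
      }
    }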
|
|
||||||
|
URIunreserved = [-._~A-Za-z0-9]
|
||||||
|
URIpercentEncoded = "%" [0-9A-Fa-f]{2}
|
||||||
|
URIsubDelims = [!$&'()*+,;=]
|
||||||
|
URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
|
||||||
|
URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
|
||||||
|
URIquery = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
|
||||||
|
URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
|
||||||
|
URIport = ":" [0-9]{1,5}
|
||||||
|
URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}
|
||||||
|
URIhostLoose = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose}
|
||||||
|
|
||||||
|
URIauthorityStrict = {URIhostStrict} {URIport}?
|
||||||
|
URIauthorityLoose = {URIlogin}? {URIhostLoose} {URIport}?
|
||||||
|
|
||||||
|
HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
|
||||||
|
HTTPpath = ("/" {HTTPsegment})*
|
||||||
|
HTTPscheme = [hH][tT][tT][pP][sS]? "://"
|
||||||
|
HTTPurlFull = {HTTPscheme} {URIauthorityLoose} {HTTPpath}? {URIquery}? {URIfragment}?
|
||||||
|
// {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
|
||||||
|
HTTPurlNoScheme = {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
|
||||||
|
HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}
|
||||||
|
|
||||||
|
FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
|
||||||
|
FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
|
||||||
|
FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
|
||||||
|
FTPscheme = [fF][tT][pP] "://"
|
||||||
|
FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?
|
||||||
|
|
||||||
|
FILEscheme = [fF][iI][lL][eE] "://"
|
||||||
|
FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?
|
||||||
|
|
||||||
|
URL = {HTTPurl} | {FTPurl} | {FILEurl}
|
||||||
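Hedged examples of what each branch accepts (inputs invented for this note): https://user:pw@example.com:8080/a?b#c matches {HTTPurlFull}; example.com/path matches {HTTPurlNoScheme}, whose strict, login-free authority keeps e-mail addresses out; ftp://host/dir/;type=i matches {FTPurl}; file:///tmp/x matches {FILEurl}.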
|
|
||||||
|
EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
|
||||||
|
EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
|
||||||
|
EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
|
||||||
|
EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
|
||||||
|
EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
|
||||||
|
// DFA minimization allows {IPv6Address} and {IPv4Address} to be included
|
||||||
|
// in the {EMAILbracketedHost} definition without incurring any size penalties,
|
||||||
|
// since {EMAILdomainLiteralText} recognizes all valid IP addresses.
|
||||||
|
// The IP address regexes are included in {EMAILbracketedHost} simply as a
|
||||||
|
// reminder that they are acceptable bracketed host forms.
|
||||||
|
EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"
|
||||||
|
EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
|
||||||
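Illustrative {EMAIL} matches (again invented): john.doe@example.com, a dotted local part over {DomainNameStrict}; "quoted local"@example.org, since {EMAILquotedString} permits spaces and backslash escapes; and user@[192.168.0.1] or user@[IPv6:2001:db8::1] via {EMAILbracketedHost}.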
|
|
||||||
|
|
||||||
%{
|
%{
|
||||||
/** Alphanumeric sequences */
|
/** Alphanumeric sequences */
|
||||||
public static final String WORD_TYPE = "<ALPHANUM>";
|
public static final String WORD_TYPE = "<ALPHANUM>";
|
||||||
|
@ -84,6 +166,12 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]
|
||||||
/** Numbers */
|
/** Numbers */
|
||||||
public static final String NUMERIC_TYPE = "<NUM>";
|
public static final String NUMERIC_TYPE = "<NUM>";
|
||||||
|
|
||||||
|
/** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
|
||||||
|
public static final String URL_TYPE = "<URL>";
|
||||||
|
|
||||||
|
/** E-mail addresses */
|
||||||
|
public static final String EMAIL_TYPE = "<EMAIL>";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
|
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
|
||||||
* scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
|
* scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
|
||||||
|
@ -112,7 +200,7 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]
|
||||||
* @param source The AttributeSource to use
|
* @param source The AttributeSource to use
|
||||||
* @param input The input reader
|
* @param input The input reader
|
||||||
*/
|
*/
|
||||||
public UAX29Tokenizer(AttributeSource source, Reader input) {
|
public UAX29URLEmailTokenizer(AttributeSource source, Reader input) {
|
||||||
super(source, input);
|
super(source, input);
|
||||||
zzReader = input;
|
zzReader = input;
|
||||||
}
|
}
|
||||||
|
@ -121,7 +209,7 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]
|
||||||
* @param factory The AttributeFactory to use
|
* @param factory The AttributeFactory to use
|
||||||
* @param input The input reader
|
* @param input The input reader
|
||||||
*/
|
*/
|
||||||
public UAX29Tokenizer(AttributeFactory factory, Reader input) {
|
public UAX29URLEmailTokenizer(AttributeFactory factory, Reader input) {
|
||||||
super(factory, input);
|
super(factory, input);
|
||||||
zzReader = input;
|
zzReader = input;
|
||||||
}
|
}
|
||||||
|
@ -201,17 +289,19 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]
|
||||||
|
|
||||||
%%
|
%%
|
||||||
|
|
||||||
// WB1. sot ÷
|
// UAX#29 WB1. sot ÷
|
||||||
// WB2. ÷ eot
|
// WB2. ÷ eot
|
||||||
//
|
//
|
||||||
<<EOF>> { return false; }
|
<<EOF>> { return false; }
|
||||||
|
|
||||||
|
{URL} { if (populateAttributes(URL_TYPE)) return true; }
|
||||||
|
{EMAIL} { if (populateAttributes(EMAIL_TYPE)) return true; }
|
||||||
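A note on ordering (my reading of JFlex semantics, not stated in the patch): JFlex prefers the longest match and breaks ties by rule order, so placing the {URL} and {EMAIL} rules ahead of the word-break rules below lets a whole address or URL win over the shorter <ALPHANUM>/<NUM> fragments those rules would otherwise emit.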
|
|
||||||
// WB8. Numeric × Numeric
|
// UAX#29 WB8. Numeric × Numeric
|
||||||
// WB11. Numeric (MidNum | MidNumLet) × Numeric
|
// WB11. Numeric (MidNum | MidNumLet) × Numeric
|
||||||
// WB12. Numeric × (MidNum | MidNumLet) Numeric
|
// WB12. Numeric × (MidNum | MidNumLet) Numeric
|
||||||
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
||||||
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
|
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
|
||||||
//
|
//
|
||||||
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
|
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
|
||||||
| {MidNumericEx} {NumericEx}
|
| {MidNumericEx} {NumericEx}
|
||||||
|
@ -220,14 +310,14 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]
|
||||||
{ if (populateAttributes(NUMERIC_TYPE)) return true; }
|
{ if (populateAttributes(NUMERIC_TYPE)) return true; }
|
||||||
|
|
||||||
|
|
||||||
// WB5. ALetter × ALetter
|
// UAX#29 WB5. ALetter × ALetter
|
||||||
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
|
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
|
||||||
// WB7. ALetter (MidLetter | MidNumLet) × ALetter
|
// WB7. ALetter (MidLetter | MidNumLet) × ALetter
|
||||||
// WB9. ALetter × Numeric
|
// WB9. ALetter × Numeric
|
||||||
// WB10. Numeric × ALetter
|
// WB10. Numeric × ALetter
|
||||||
// WB13. Katakana × Katakana
|
// WB13. Katakana × Katakana
|
||||||
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
||||||
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
|
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
|
||||||
//
|
//
|
||||||
{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
|
{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
|
||||||
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
|
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
|
||||||
|
@ -260,15 +350,15 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]
|
||||||
//
|
//
|
||||||
\p{LB:Complex_Context}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }
|
\p{LB:Complex_Context}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }
|
||||||
|
|
||||||
// WB14. Any ÷ Any
|
// UAX#29 WB14. Any ÷ Any
|
||||||
//
|
//
|
||||||
\p{Script:Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
|
\p{Script:Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
|
||||||
\p{Script:Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }
|
\p{Script:Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }
|
||||||
|
|
||||||
|
|
||||||
// WB3. CR × LF
|
// UAX#29 WB3. CR × LF
|
||||||
// WB3a. (Newline | CR | LF) ÷
|
// WB3a. (Newline | CR | LF) ÷
|
||||||
// WB3b. ÷ (Newline | CR | LF)
|
// WB3b. ÷ (Newline | CR | LF)
|
||||||
// WB14. Any ÷ Any
|
// WB14. Any ÷ Any
|
||||||
//
|
//
|
||||||
[^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
|
[^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
|
|
@ -27,7 +27,10 @@
|
||||||
as of Lucene 3.1, implements the Word Break rules from the Unicode Text
|
as of Lucene 3.1, implements the Word Break rules from the Unicode Text
|
||||||
Segmentation algorithm, as specified in
|
Segmentation algorithm, as specified in
|
||||||
<a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
|
<a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
|
||||||
URLs and email addresses are also tokenized according to the relevant RFCs.
|
Unlike <code>UAX29URLEmailTokenizer</code>, URLs and email addresses are
|
||||||
|
<b>not</b> tokenized as single tokens, but are instead split up into
|
||||||
|
tokens according to the UAX#29 word break rules.
|
||||||
|
<br/>
|
||||||
<code><a href="StandardAnalyzer">StandardAnalyzer</a></code> includes
|
<code><a href="StandardAnalyzer">StandardAnalyzer</a></code> includes
|
||||||
<code>StandardTokenizer</code>,
|
<code>StandardTokenizer</code>,
|
||||||
<code><a href="StandardFilter">StandardFilter</a></code>,
|
<code><a href="StandardFilter">StandardFilter</a></code>,
|
||||||
|
@ -46,13 +49,11 @@
|
||||||
<code><a href="../../../../../../all/org/apache/lucene/analysis/LowerCaseFilter.html">LowerCaseFilter</a></code>
|
<code><a href="../../../../../../all/org/apache/lucene/analysis/LowerCaseFilter.html">LowerCaseFilter</a></code>
|
||||||
and <code><a href="../../../../../../all/org/apache/lucene/analysis/StopFilter.html">StopFilter</a></code>.
|
and <code><a href="../../../../../../all/org/apache/lucene/analysis/StopFilter.html">StopFilter</a></code>.
|
||||||
</li>
|
</li>
|
||||||
<li><code><a href="UAX29Tokenizer.html">UAX29Tokenizer</a></code>:
|
<li><code><a href="UAX29URLEmailTokenizer.html">UAX29URLEmailTokenizer</a></code>:
|
||||||
implements the Word Break rules from the Unicode Text Segmentation
|
implements the Word Break rules from the Unicode Text Segmentation
|
||||||
algorithm, as specified in
|
algorithm, as specified in
|
||||||
<a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
|
<a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
|
||||||
Unlike <code>StandardTokenizer</code>, URLs and email addresses are
|
URLs and email addresses are also tokenized according to the relevant RFCs.
|
||||||
<b>not</b> tokenized as single tokens, but are instead split up into
|
|
||||||
tokens according to the UAX#29 word break rules.
|
|
||||||
</li>
|
</li>
|
||||||
</ul>
|
</ul>
|
||||||
</body>
|
</body>
|
||||||
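A minimal usage sketch of the renamed tokenizer, assuming the trunk API at the time of this commit; the class name and input string are invented:

    import java.io.StringReader;
    import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

    public class UAX29URLEmailDemo {
      public static void main(String[] args) throws Exception {
        // URLs and e-mail addresses come back whole, typed <URL>/<EMAIL>,
        // instead of being split at punctuation by the word-break rules.
        UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(
            new StringReader("mail bob@example.com or see http://example.com/x"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
        while (tokenizer.incrementToken()) {
          System.out.println(term.toString() + " " + type.type());
        }
        tokenizer.end();
        tokenizer.close();
      }
    }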
|
|
|
@ -2,21 +2,14 @@ package org.apache.lucene.analysis.core;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
|
||||||
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
@ -58,63 +51,6 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
/** Passes through tokens with type "<URL>" and blocks all other types. */
|
|
||||||
private class URLFilter extends TokenFilter {
|
|
||||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
|
||||||
public URLFilter(TokenStream in) {
|
|
||||||
super(in);
|
|
||||||
}
|
|
||||||
@Override
|
|
||||||
public final boolean incrementToken() throws java.io.IOException {
|
|
||||||
boolean isTokenAvailable = false;
|
|
||||||
while (input.incrementToken()) {
|
|
||||||
if (typeAtt.type() == StandardTokenizer.TOKEN_TYPES[StandardTokenizer.URL]) {
|
|
||||||
isTokenAvailable = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return isTokenAvailable;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Passes through tokens with type "<EMAIL>" and blocks all other types. */
|
|
||||||
private class EmailFilter extends TokenFilter {
|
|
||||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
|
||||||
public EmailFilter(TokenStream in) {
|
|
||||||
super(in);
|
|
||||||
}
|
|
||||||
@Override
|
|
||||||
public final boolean incrementToken() throws java.io.IOException {
|
|
||||||
boolean isTokenAvailable = false;
|
|
||||||
while (input.incrementToken()) {
|
|
||||||
if (typeAtt.type() == StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMAIL]) {
|
|
||||||
isTokenAvailable = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return isTokenAvailable;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private Analyzer urlAnalyzer = new ReusableAnalyzerBase() {
|
|
||||||
@Override
|
|
||||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
|
||||||
StandardTokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
|
|
||||||
tokenizer.setMaxTokenLength(Integer.MAX_VALUE); // Tokenize arbitrary length URLs
|
|
||||||
TokenFilter filter = new URLFilter(tokenizer);
|
|
||||||
return new TokenStreamComponents(tokenizer, filter);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
private Analyzer emailAnalyzer = new ReusableAnalyzerBase() {
|
|
||||||
@Override
|
|
||||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
|
||||||
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
|
|
||||||
TokenFilter filter = new EmailFilter(tokenizer);
|
|
||||||
return new TokenStreamComponents(tokenizer, filter);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
public void testArmenian() throws Exception {
|
public void testArmenian() throws Exception {
|
||||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
|
||||||
new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
|
new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
|
||||||
|
@ -261,138 +197,6 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
|
||||||
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
|
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testWikiURLs() throws Exception {
|
|
||||||
Reader reader = null;
|
|
||||||
String luceneResourcesWikiPage;
|
|
||||||
try {
|
|
||||||
reader = new InputStreamReader
|
|
||||||
(getClass().getResourceAsStream("LuceneResourcesWikiPage.html"), "UTF-8");
|
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
char[] buffer = new char[1024];
|
|
||||||
int numCharsRead;
|
|
||||||
while (-1 != (numCharsRead = reader.read(buffer))) {
|
|
||||||
builder.append(buffer, 0, numCharsRead);
|
|
||||||
}
|
|
||||||
luceneResourcesWikiPage = builder.toString();
|
|
||||||
} finally {
|
|
||||||
if (null != reader) {
|
|
||||||
reader.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
assertTrue(null != luceneResourcesWikiPage
|
|
||||||
&& luceneResourcesWikiPage.length() > 0);
|
|
||||||
BufferedReader bufferedReader = null;
|
|
||||||
String[] urls;
|
|
||||||
try {
|
|
||||||
List<String> urlList = new ArrayList<String>();
|
|
||||||
bufferedReader = new BufferedReader(new InputStreamReader
|
|
||||||
(getClass().getResourceAsStream("LuceneResourcesWikiPageURLs.txt"), "UTF-8"));
|
|
||||||
String line;
|
|
||||||
while (null != (line = bufferedReader.readLine())) {
|
|
||||||
line = line.trim();
|
|
||||||
if (line.length() > 0) {
|
|
||||||
urlList.add(line);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
urls = urlList.toArray(new String[urlList.size()]);
|
|
||||||
} finally {
|
|
||||||
if (null != bufferedReader) {
|
|
||||||
bufferedReader.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
assertTrue(null != urls && urls.length > 0);
|
|
||||||
BaseTokenStreamTestCase.assertAnalyzesTo
|
|
||||||
(urlAnalyzer, luceneResourcesWikiPage, urls);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testEmails() throws Exception {
|
|
||||||
Reader reader = null;
|
|
||||||
String randomTextWithEmails;
|
|
||||||
try {
|
|
||||||
reader = new InputStreamReader
|
|
||||||
(getClass().getResourceAsStream("random.text.with.email.addresses.txt"), "UTF-8");
|
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
char[] buffer = new char[1024];
|
|
||||||
int numCharsRead;
|
|
||||||
while (-1 != (numCharsRead = reader.read(buffer))) {
|
|
||||||
builder.append(buffer, 0, numCharsRead);
|
|
||||||
}
|
|
||||||
randomTextWithEmails = builder.toString();
|
|
||||||
} finally {
|
|
||||||
if (null != reader) {
|
|
||||||
reader.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
assertTrue(null != randomTextWithEmails
|
|
||||||
&& randomTextWithEmails.length() > 0);
|
|
||||||
BufferedReader bufferedReader = null;
|
|
||||||
String[] emails;
|
|
||||||
try {
|
|
||||||
List<String> emailList = new ArrayList<String>();
|
|
||||||
bufferedReader = new BufferedReader(new InputStreamReader
|
|
||||||
(getClass().getResourceAsStream("email.addresses.from.random.text.with.email.addresses.txt"), "UTF-8"));
|
|
||||||
String line;
|
|
||||||
while (null != (line = bufferedReader.readLine())) {
|
|
||||||
line = line.trim();
|
|
||||||
if (line.length() > 0) {
|
|
||||||
emailList.add(line);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
emails = emailList.toArray(new String[emailList.size()]);
|
|
||||||
} finally {
|
|
||||||
if (null != bufferedReader) {
|
|
||||||
bufferedReader.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
assertTrue(null != emails && emails.length > 0);
|
|
||||||
BaseTokenStreamTestCase.assertAnalyzesTo
|
|
||||||
(emailAnalyzer, randomTextWithEmails, emails);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testURLs() throws Exception {
|
|
||||||
Reader reader = null;
|
|
||||||
String randomTextWithURLs;
|
|
||||||
try {
|
|
||||||
reader = new InputStreamReader
|
|
||||||
(getClass().getResourceAsStream("random.text.with.urls.txt"), "UTF-8");
|
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
char[] buffer = new char[1024];
|
|
||||||
int numCharsRead;
|
|
||||||
while (-1 != (numCharsRead = reader.read(buffer))) {
|
|
||||||
builder.append(buffer, 0, numCharsRead);
|
|
||||||
}
|
|
||||||
randomTextWithURLs = builder.toString();
|
|
||||||
} finally {
|
|
||||||
if (null != reader) {
|
|
||||||
reader.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
assertTrue(null != randomTextWithURLs
|
|
||||||
&& randomTextWithURLs.length() > 0);
|
|
||||||
BufferedReader bufferedReader = null;
|
|
||||||
String[] urls;
|
|
||||||
try {
|
|
||||||
List<String> urlList = new ArrayList<String>();
|
|
||||||
bufferedReader = new BufferedReader(new InputStreamReader
|
|
||||||
(getClass().getResourceAsStream("urls.from.random.text.with.urls.txt"), "UTF-8"));
|
|
||||||
String line;
|
|
||||||
while (null != (line = bufferedReader.readLine())) {
|
|
||||||
line = line.trim();
|
|
||||||
if (line.length() > 0) {
|
|
||||||
urlList.add(line);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
urls = urlList.toArray(new String[urlList.size()]);
|
|
||||||
} finally {
|
|
||||||
if (null != bufferedReader) {
|
|
||||||
bufferedReader.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
assertTrue(null != urls && urls.length > 0);
|
|
||||||
BaseTokenStreamTestCase.assertAnalyzesTo
|
|
||||||
(urlAnalyzer, randomTextWithURLs, urls);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testUnicodeWordBreaks() throws Exception {
|
public void testUnicodeWordBreaks() throws Exception {
|
||||||
WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
|
WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
|
||||||
wordBreakTest.test(a);
|
wordBreakTest.test(a);
|
||||||
|
|
|
@ -2,14 +2,21 @@ package org.apache.lucene.analysis.core;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.standard.UAX29Tokenizer;
|
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||||
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
@ -28,7 +35,7 @@ import java.util.Arrays;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
|
public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
public void testHugeDoc() throws IOException {
|
public void testHugeDoc() throws IOException {
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
|
@ -37,7 +44,7 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
|
||||||
sb.append(whitespace);
|
sb.append(whitespace);
|
||||||
sb.append("testing 1234");
|
sb.append("testing 1234");
|
||||||
String input = sb.toString();
|
String input = sb.toString();
|
||||||
UAX29Tokenizer tokenizer = new UAX29Tokenizer(new StringReader(input));
|
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(new StringReader(input));
|
||||||
BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
|
BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -46,11 +53,70 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
|
||||||
protected TokenStreamComponents createComponents
|
protected TokenStreamComponents createComponents
|
||||||
(String fieldName, Reader reader) {
|
(String fieldName, Reader reader) {
|
||||||
|
|
||||||
Tokenizer tokenizer = new UAX29Tokenizer(reader);
|
Tokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
|
||||||
return new TokenStreamComponents(tokenizer);
|
return new TokenStreamComponents(tokenizer);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/** Passes through tokens with type "<URL>" and blocks all other types. */
|
||||||
|
private class URLFilter extends TokenFilter {
|
||||||
|
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||||
|
public URLFilter(TokenStream in) {
|
||||||
|
super(in);
|
||||||
|
}
|
||||||
|
@Override
|
||||||
|
public final boolean incrementToken() throws java.io.IOException {
|
||||||
|
boolean isTokenAvailable = false;
|
||||||
|
while (input.incrementToken()) {
|
||||||
|
if (typeAtt.type() == UAX29URLEmailTokenizer.URL_TYPE) {
|
||||||
|
isTokenAvailable = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return isTokenAvailable;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Passes through tokens with type "<EMAIL>" and blocks all other types. */
|
||||||
|
private class EmailFilter extends TokenFilter {
|
||||||
|
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||||
|
public EmailFilter(TokenStream in) {
|
||||||
|
super(in);
|
||||||
|
}
|
||||||
|
@Override
|
||||||
|
public final boolean incrementToken() throws java.io.IOException {
|
||||||
|
boolean isTokenAvailable = false;
|
||||||
|
while (input.incrementToken()) {
|
||||||
|
if (typeAtt.type() == UAX29URLEmailTokenizer.EMAIL_TYPE) {
|
||||||
|
isTokenAvailable = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return isTokenAvailable;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private Analyzer urlAnalyzer = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
|
||||||
|
tokenizer.setMaxTokenLength(Integer.MAX_VALUE); // Tokenize arbitrary length URLs
|
||||||
|
TokenFilter filter = new URLFilter(tokenizer);
|
||||||
|
return new TokenStreamComponents(tokenizer, filter);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
private Analyzer emailAnalyzer = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
|
||||||
|
TokenFilter filter = new EmailFilter(tokenizer);
|
||||||
|
return new TokenStreamComponents(tokenizer, filter);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
public void testArmenian() throws Exception {
|
public void testArmenian() throws Exception {
|
||||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
|
||||||
new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
|
new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
|
||||||
|
@ -163,7 +229,6 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
|
||||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
|
||||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
|
||||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
|
||||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testTextWithNumbersSA() throws Exception {
|
public void testTextWithNumbersSA() throws Exception {
|
||||||
|
@ -197,6 +262,140 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
|
||||||
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
|
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testWikiURLs() throws Exception {
|
||||||
|
Reader reader = null;
|
||||||
|
String luceneResourcesWikiPage;
|
||||||
|
try {
|
||||||
|
reader = new InputStreamReader(getClass().getResourceAsStream
|
||||||
|
("LuceneResourcesWikiPage.html"), "UTF-8");
|
||||||
|
StringBuilder builder = new StringBuilder();
|
||||||
|
char[] buffer = new char[1024];
|
||||||
|
int numCharsRead;
|
||||||
|
while (-1 != (numCharsRead = reader.read(buffer))) {
|
||||||
|
builder.append(buffer, 0, numCharsRead);
|
||||||
|
}
|
||||||
|
luceneResourcesWikiPage = builder.toString();
|
||||||
|
} finally {
|
||||||
|
if (null != reader) {
|
||||||
|
reader.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertTrue(null != luceneResourcesWikiPage
|
||||||
|
&& luceneResourcesWikiPage.length() > 0);
|
||||||
|
BufferedReader bufferedReader = null;
|
||||||
|
String[] urls;
|
||||||
|
try {
|
||||||
|
List<String> urlList = new ArrayList<String>();
|
||||||
|
bufferedReader = new BufferedReader(new InputStreamReader
|
||||||
|
(getClass().getResourceAsStream("LuceneResourcesWikiPageURLs.txt"), "UTF-8"));
|
||||||
|
String line;
|
||||||
|
while (null != (line = bufferedReader.readLine())) {
|
||||||
|
line = line.trim();
|
||||||
|
if (line.length() > 0) {
|
||||||
|
urlList.add(line);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
urls = urlList.toArray(new String[urlList.size()]);
|
||||||
|
} finally {
|
||||||
|
if (null != bufferedReader) {
|
||||||
|
bufferedReader.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertTrue(null != urls && urls.length > 0);
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo
|
||||||
|
(urlAnalyzer, luceneResourcesWikiPage, urls);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testEmails() throws Exception {
|
||||||
|
Reader reader = null;
|
||||||
|
String randomTextWithEmails;
|
||||||
|
try {
|
||||||
|
reader = new InputStreamReader(getClass().getResourceAsStream
|
||||||
|
("random.text.with.email.addresses.txt"), "UTF-8");
|
||||||
|
StringBuilder builder = new StringBuilder();
|
||||||
|
char[] buffer = new char[1024];
|
||||||
|
int numCharsRead;
|
||||||
|
while (-1 != (numCharsRead = reader.read(buffer))) {
|
||||||
|
builder.append(buffer, 0, numCharsRead);
|
||||||
|
}
|
||||||
|
randomTextWithEmails = builder.toString();
|
||||||
|
} finally {
|
||||||
|
if (null != reader) {
|
||||||
|
reader.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertTrue(null != randomTextWithEmails
|
||||||
|
&& randomTextWithEmails.length() > 0);
|
||||||
|
BufferedReader bufferedReader = null;
|
||||||
|
String[] emails;
|
||||||
|
try {
|
||||||
|
List<String> emailList = new ArrayList<String>();
|
||||||
|
bufferedReader = new BufferedReader(new InputStreamReader
|
||||||
|
(getClass().getResourceAsStream
|
||||||
|
("email.addresses.from.random.text.with.email.addresses.txt"), "UTF-8"));
|
||||||
|
String line;
|
||||||
|
while (null != (line = bufferedReader.readLine())) {
|
||||||
|
line = line.trim();
|
||||||
|
if (line.length() > 0) {
|
||||||
|
emailList.add(line);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
emails = emailList.toArray(new String[emailList.size()]);
|
||||||
|
} finally {
|
||||||
|
if (null != bufferedReader) {
|
||||||
|
bufferedReader.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertTrue(null != emails && emails.length > 0);
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo
|
||||||
|
(emailAnalyzer, randomTextWithEmails, emails);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testURLs() throws Exception {
|
||||||
|
Reader reader = null;
|
||||||
|
String randomTextWithURLs;
|
||||||
|
try {
|
||||||
|
reader = new InputStreamReader(getClass().getResourceAsStream
|
||||||
|
("random.text.with.urls.txt"), "UTF-8");
|
||||||
|
StringBuilder builder = new StringBuilder();
|
||||||
|
char[] buffer = new char[1024];
|
||||||
|
int numCharsRead;
|
||||||
|
while (-1 != (numCharsRead = reader.read(buffer))) {
|
||||||
|
builder.append(buffer, 0, numCharsRead);
|
||||||
|
}
|
||||||
|
randomTextWithURLs = builder.toString();
|
||||||
|
} finally {
|
||||||
|
if (null != reader) {
|
||||||
|
reader.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertTrue(null != randomTextWithURLs
|
||||||
|
&& randomTextWithURLs.length() > 0);
|
||||||
|
BufferedReader bufferedReader = null;
|
||||||
|
String[] urls;
|
||||||
|
try {
|
||||||
|
List<String> urlList = new ArrayList<String>();
|
||||||
|
bufferedReader = new BufferedReader(new InputStreamReader
|
||||||
|
(getClass().getResourceAsStream
|
||||||
|
("urls.from.random.text.with.urls.txt"), "UTF-8"));
|
||||||
|
String line;
|
||||||
|
while (null != (line = bufferedReader.readLine())) {
|
||||||
|
line = line.trim();
|
||||||
|
if (line.length() > 0) {
|
||||||
|
urlList.add(line);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
urls = urlList.toArray(new String[urlList.size()]);
|
||||||
|
} finally {
|
||||||
|
if (null != bufferedReader) {
|
||||||
|
bufferedReader.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertTrue(null != urls && urls.length > 0);
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo
|
||||||
|
(urlAnalyzer, randomTextWithURLs, urls);
|
||||||
|
}
|
||||||
|
|
||||||
public void testUnicodeWordBreaks() throws Exception {
|
public void testUnicodeWordBreaks() throws Exception {
|
||||||
WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
|
WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
|
||||||
wordBreakTest.test(a);
|
wordBreakTest.test(a);
|
|
@ -123,7 +123,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
|
||||||
assertAnalyzesToReuse(
|
assertAnalyzesToReuse(
|
||||||
analyzer,
|
analyzer,
|
||||||
"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
|
"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
|
||||||
new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz@demo.com" });
|
new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @deprecated (3.1) for version back compat */
|
/** @deprecated (3.1) for version back compat */
|
||||||
|
|
|
@ -302,8 +302,10 @@ New Features
|
||||||
* SOLR-1336: Add SmartChinese (word segmentation for Simplified Chinese)
|
* SOLR-1336: Add SmartChinese (word segmentation for Simplified Chinese)
|
||||||
tokenizer and filters to contrib/analysis-extras (rmuir)
|
tokenizer and filters to contrib/analysis-extras (rmuir)
|
||||||
|
|
||||||
* SOLR-2211: Added UAX29TokenizerFactory, which implements UAX#29, a unicode algorithm
|
* SOLR-2211,LUCENE-2763: Added UAX29URLEmailTokenizerFactory, which implements
|
||||||
with good results for most languages. (Tom Burton-West via rmuir)
|
UAX#29, a Unicode algorithm with good results for most languages, as well as
|
||||||
|
URL and E-mail tokenization according to the relevant RFCs.
|
||||||
|
(Tom Burton-West via rmuir)
|
||||||
|
|
||||||
* SOLR-2237: Added StempelPolishStemFilterFactory to contrib/analysis-extras (rmuir)
|
* SOLR-2237: Added StempelPolishStemFilterFactory to contrib/analysis-extras (rmuir)
|
||||||
|
|
||||||
|
|
|
@ -20,7 +20,7 @@ package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.standard.UAX29Tokenizer;
|
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
|
||||||
|
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
@ -30,14 +30,14 @@ import java.util.Map;
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class UAX29TokenizerFactory extends BaseTokenizerFactory {
|
public class UAX29URLEmailTokenizerFactory extends BaseTokenizerFactory {
|
||||||
@Override
|
@Override
|
||||||
public void init(Map<String,String> args) {
|
public void init(Map<String,String> args) {
|
||||||
super.init(args);
|
super.init(args);
|
||||||
assureMatchVersion();
|
assureMatchVersion();
|
||||||
}
|
}
|
||||||
|
|
||||||
public UAX29Tokenizer create(Reader input) {
|
public UAX29URLEmailTokenizer create(Reader input) {
|
||||||
return new UAX29Tokenizer(input);
|
return new UAX29URLEmailTokenizer(input);
|
||||||
}
|
}
|
||||||
}
|
}
|
|
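And a hedged sketch of driving the Solr factory directly (normally it is wired up via a <tokenizer> element in schema.xml; the class name, version label, and input below are assumptions):

    import java.io.StringReader;
    import java.util.HashMap;
    import java.util.Map;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.solr.analysis.UAX29URLEmailTokenizerFactory;

    public class FactoryDemo {
      public static void main(String[] args) {
        UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
        Map<String,String> params = new HashMap<String,String>();
        // assureMatchVersion() requires this key; the value label is assumed.
        params.put("luceneMatchVersion", "LUCENE_40");
        factory.init(params);
        Tokenizer stream = factory.create(new StringReader("mail me at bob@example.com"));
      }
    }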
@ -1,81 +0,0 @@
|
||||||
package org.apache.solr.analysis;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
import java.io.Reader;
|
|
||||||
import java.io.StringReader;
|
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* A few tests based on org.apache.lucene.analysis.TestUAX29Tokenizer;
|
|
||||||
*/
|
|
||||||
|
|
||||||
public class TestUAX29TokenizerFactory extends BaseTokenTestCase {
|
|
||||||
/**
|
|
||||||
* Test UAX29TokenizerFactory
|
|
||||||
*/
|
|
||||||
public void testUAX29Tokenizer() throws Exception {
|
|
||||||
Reader reader = new StringReader("Wha\u0301t's this thing do?");
|
|
||||||
UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
|
|
||||||
factory.init(DEFAULT_VERSION_PARAM);
|
|
||||||
Tokenizer stream = factory.create(reader);
|
|
||||||
assertTokenStreamContents(stream,
|
|
||||||
new String[] {"Wha\u0301t's", "this", "thing", "do" });
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testArabic() throws Exception {
|
|
||||||
Reader reader = new StringReader("الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.");
|
|
||||||
UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
|
|
||||||
factory.init(DEFAULT_VERSION_PARAM);
|
|
||||||
Tokenizer stream = factory.create(reader);
|
|
||||||
assertTokenStreamContents(stream,
|
|
||||||
new String[] {"الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
|
|
||||||
"بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" });
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testChinese() throws Exception {
|
|
||||||
Reader reader = new StringReader("我是中国人。 1234 Tests ");
|
|
||||||
UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
|
|
||||||
factory.init(DEFAULT_VERSION_PARAM);
|
|
||||||
Tokenizer stream = factory.create(reader);
|
|
||||||
assertTokenStreamContents(stream,
|
|
||||||
new String[] {"我", "是", "中", "国", "人", "1234", "Tests"});
|
|
||||||
}
|
|
||||||
public void testKorean() throws Exception {
|
|
||||||
Reader reader = new StringReader("안녕하세요 한글입니다");
|
|
||||||
UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
|
|
||||||
factory.init(DEFAULT_VERSION_PARAM);
|
|
||||||
Tokenizer stream = factory.create(reader);
|
|
||||||
assertTokenStreamContents(stream,
|
|
||||||
new String[] {"안녕하세요", "한글입니다"});
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testHyphen() throws Exception {
|
|
||||||
Reader reader = new StringReader("some-dashed-phrase");
|
|
||||||
UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
|
|
||||||
factory.init(DEFAULT_VERSION_PARAM);
|
|
||||||
Tokenizer stream = factory.create(reader);
|
|
||||||
assertTokenStreamContents(stream,
|
|
||||||
new String[] {"some", "dashed", "phrase"});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,155 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A few tests based on org.apache.lucene.analysis.TestUAX29URLEmailTokenizer
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class TestUAX29URLEmailTokenizerFactory extends BaseTokenTestCase {
|
||||||
|
|
||||||
|
public void testUAX29URLEmailTokenizer() throws Exception {
|
||||||
|
Reader reader = new StringReader("Wha\u0301t's this thing do?");
|
||||||
|
UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
|
||||||
|
factory.init(DEFAULT_VERSION_PARAM);
|
||||||
|
Tokenizer stream = factory.create(reader);
|
||||||
|
assertTokenStreamContents(stream,
|
||||||
|
new String[] {"Wha\u0301t's", "this", "thing", "do" });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testArabic() throws Exception {
|
||||||
|
Reader reader = new StringReader("الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.");
|
||||||
|
UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
|
||||||
|
factory.init(DEFAULT_VERSION_PARAM);
|
||||||
|
Tokenizer stream = factory.create(reader);
|
||||||
|
assertTokenStreamContents(stream,
|
||||||
|
new String[] {"الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
|
||||||
|
"بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testChinese() throws Exception {
|
||||||
|
Reader reader = new StringReader("我是中国人。 1234 Tests ");
|
||||||
|
UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
|
||||||
|
factory.init(DEFAULT_VERSION_PARAM);
|
||||||
|
Tokenizer stream = factory.create(reader);
|
||||||
|
assertTokenStreamContents(stream,
|
||||||
|
new String[] {"我", "是", "中", "国", "人", "1234", "Tests"});
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testKorean() throws Exception {
|
||||||
|
Reader reader = new StringReader("안녕하세요 한글입니다");
|
||||||
|
UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
|
||||||
|
factory.init(DEFAULT_VERSION_PARAM);
|
||||||
|
Tokenizer stream = factory.create(reader);
|
||||||
|
assertTokenStreamContents(stream,
|
||||||
|
new String[] {"안녕하세요", "한글입니다"});
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testHyphen() throws Exception {
|
||||||
|
Reader reader = new StringReader("some-dashed-phrase");
|
||||||
|
UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
|
||||||
|
factory.init(DEFAULT_VERSION_PARAM);
|
||||||
|
Tokenizer stream = factory.create(reader);
|
||||||
|
assertTokenStreamContents(stream,
|
||||||
|
new String[] {"some", "dashed", "phrase"});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test with some URLs from TestUAX29URLEmailTokenizer's
|
||||||
|
// urls.from.random.text.with.urls.txt
|
||||||
|
public void testURLs() throws Exception {
|
||||||
|
String textWithURLs
|
||||||
|
= "http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram¶graphs=50&length=200&no-ads=on\n"
|
||||||
|
+ " some extra\nWords thrown in here. "
|
||||||
|
+ "http://c5-3486.bisynxu.FR/aI.YnNms/"
|
||||||
|
+ " samba Halta gamba "
|
||||||
|
+ "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R\n"
|
||||||
|
+ "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb\n"
|
||||||
|
+ "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m"
|
||||||
|
+ " inter Locutio "
|
||||||
|
+ "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/\n"
|
||||||
|
+ "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7"
|
||||||
|
+ " blah Sirrah woof "
|
||||||
|
+ "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4\n";
|
||||||
|
Reader reader = new StringReader(textWithURLs);
|
||||||
|
UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
|
||||||
|
factory.init(DEFAULT_VERSION_PARAM);
|
||||||
|
Tokenizer stream = factory.create(reader);
|
||||||
|
assertTokenStreamContents(stream,
|
||||||
|
new String[] {
|
||||||
|
"http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram¶graphs=50&length=200&no-ads=on",
|
||||||
|
"some", "extra", "Words", "thrown", "in", "here",
|
||||||
|
"http://c5-3486.bisynxu.FR/aI.YnNms/",
|
||||||
|
"samba", "Halta", "gamba",
|
||||||
|
"ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R",
|
||||||
|
"M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb",
|
||||||
|
"Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m",
|
||||||
|
"inter", "Locutio",
|
||||||
|
"[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/",
|
||||||
|
"file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7",
|
||||||
|
"blah", "Sirrah", "woof",
|
||||||
|
"http://[a42:a7b6::]/qSmxSUU4z/%52qVl4"
|
||||||
|
}
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test with some emails from TestUAX29URLEmailTokenizer's
|
||||||
|
// email.addresses.from.random.text.with.email.addresses.txt
|
||||||
|
public void testEmails() throws Exception {
|
||||||
|
String textWithEmails
|
||||||
|
= " some extra\nWords thrown in here. "
|
||||||
|
+ "dJ8ngFi@avz13m.CC\n"
|
||||||
|
+ "kU-l6DS@[082.015.228.189]\n"
|
||||||
|
+ "\"%U\u0012@?\\B\"@Fl2d.md"
|
||||||
|
+ " samba Halta gamba "
|
||||||
|
+ "Bvd#@tupjv.sn\n"
|
||||||
|
+ "SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt\n"
|
||||||
|
+ "~+Kdz@3mousnl.SE\n"
|
||||||
|
+ " inter Locutio "
|
||||||
|
+ "C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY\n"
|
||||||
|
+ "}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM"
|
||||||
|
+ " blah Sirrah woof "
|
||||||
|
+ "lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae\n"
|
||||||
|
+ "lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H\n";
|
||||||
|
Reader reader = new StringReader(textWithEmails);
|
||||||
|
UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
|
||||||
|
factory.init(DEFAULT_VERSION_PARAM);
|
||||||
|
Tokenizer stream = factory.create(reader);
|
||||||
|
assertTokenStreamContents(stream,
|
||||||
|
new String[] {
|
||||||
|
"some", "extra", "Words", "thrown", "in", "here",
|
||||||
|
"dJ8ngFi@avz13m.CC",
|
||||||
|
"kU-l6DS@[082.015.228.189]",
|
||||||
|
"\"%U\u0012@?\\B\"@Fl2d.md",
|
||||||
|
"samba", "Halta", "gamba",
|
||||||
|
"Bvd#@tupjv.sn",
|
||||||
|
"SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt",
|
||||||
|
"~+Kdz@3mousnl.SE",
|
||||||
|
"inter", "Locutio",
|
||||||
|
"C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY",
|
||||||
|
"}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM",
|
||||||
|
"blah", "Sirrah", "woof",
|
||||||
|
"lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae",
|
||||||
|
"lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H"
|
||||||
|
}
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|