mirror of https://github.com/apache/lucene.git
LUCENE-2763: Swap URL+Email recognizing StandardTokenizer and UAX29Tokenizer
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1043071 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 5b2e0f786b
commit 2b9726ae81
@@ -9,15 +9,17 @@ API Changes

* LUCENE-2413: Removed the AnalyzerUtil in common/miscellaneous. (Robert Muir)

* LUCENE-2167: StandardTokenizer/Analyzer in common/standard/ now implement
  the Word Break rules from the Unicode Text Segmentation algorithm (UAX#29),
  as well as tokenizing URLs and email addresses according to the relevant
  RFCs. ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
  behavior. (Steven Rowe, Robert Muir, Uwe Schindler)

* LUCENE-2699: Update StandardTokenizer and UAX29Tokenizer to Unicode 6.0.0.
  (Steven Rowe)
* LUCENE-2167,LUCENE-2699,LUCENE-2763: StandardTokenizer/Analyzer in
  common/standard/ now implement the Word Break rules from the Unicode 6.0.0
  Text Segmentation algorithm (UAX#29).

  ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
  implementation and behavior.

  UAX29URLEmailTokenizer tokenizes URLs and E-mail addresses according to the
  relevant RFCs, in addition to implementing the UAX#29 Word Break rules.
  (Steven Rowe, Robert Muir, Uwe Schindler)

* LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles
  can be generated. (Chris Harris via Steven Rowe)
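As a reading aid (not part of the commit), here is a minimal usage sketch of the renamed tokenizer, assuming the Lucene 3.1-era TokenStream consumer API; the input text is made up:

    import java.io.StringReader;
    import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

    public class UAX29URLEmailDemo {
      public static void main(String[] args) throws Exception {
        UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(
            new StringReader("mail admin@example.com or visit http://example.com/faq"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
        while (tokenizer.incrementToken()) {
          // The URL and the e-mail address each come back as a single token,
          // typed <URL> and <EMAIL> respectively.
          System.out.println(term.toString() + "\t" + type.type());
        }
        tokenizer.end();
        tokenizer.close();
      }
    }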
@@ -38,7 +38,7 @@

  <target name="compile-core" depends="jflex-notice, common.compile-core"/>

  <target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-UAX29Tokenizer,jflex-wiki-tokenizer"/>
  <target name="jflex" depends="jflex-check,clean-jflex,jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,jflex-wiki-tokenizer"/>

  <target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
    <taskdef classname="jflex.anttask.JFlexTask" name="jflex">

@@ -62,11 +62,11 @@
           nobak="on" />
  </target>

  <target name="jflex-UAX29Tokenizer" depends="jflex-check" if="jflex.present">
  <target name="jflex-UAX29URLEmailTokenizer" depends="jflex-check" if="jflex.present">
    <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
      <classpath refid="jflex.classpath"/>
    </taskdef>
    <jflex file="src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex"
    <jflex file="src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex"
           outdir="src/java/org/apache/lucene/analysis/standard"
           nobak="on" />
  </target>
@@ -15,8 +15,8 @@
 */

// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
// file version from Tuesday, October 12, 2010 11:34:09 AM UTC
// generated on Wednesday, October 13, 2010 4:12:27 AM UTC
// file version from Saturday, December 4, 2010 12:34:19 PM UTC
// generated on Sunday, December 5, 2010 12:24:12 AM UTC
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros

ASCIITLD = "." (
@@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/3/10 9:07 AM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 12/4/10 7:24 PM */

package org.apache.lucene.analysis.standard;

@@ -33,8 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
 * This class is a scanner generated by
 * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
 * on 10/3/10 9:07 AM from the specification file
 * <tt>C:/Users/rmuir/workspace/lucene-clean/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
 * on 12/4/10 7:24 PM from the specification file
 * <tt>C:/cygwin/home/us/svn/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
 */
class ClassicTokenizerImpl implements StandardTokenizerInterface {
@@ -630,6 +630,12 @@ public final void getText(CharTermAttribute t) {

      zzState = ZZ_LEXSTATE[zzLexicalState];

      // set up zzAction for empty match case:
      int zzAttributes = zzAttrL[zzState];
      if ( (zzAttributes & 1) == 1 ) {
        zzAction = zzState;
      }


      zzForAction: {
        while (true) {

@@ -662,7 +668,7 @@ public final void getText(CharTermAttribute t) {
          if (zzNext == -1) break zzForAction;
          zzState = zzNext;

          int zzAttributes = zzAttrL[zzState];
          zzAttributes = zzAttrL[zzState];
          if ( (zzAttributes & 1) == 1 ) {
            zzAction = zzState;
            zzMarkedPosL = zzCurrentPosL;

@@ -676,45 +682,45 @@ public final void getText(CharTermAttribute t) {
      zzMarkedPos = zzMarkedPosL;

      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
        case 10:
          { return EMAIL;
          }
        case 11: break;
        case 2:
          { return ALPHANUM;
          }
        case 12: break;
        case 4:
          { return HOST;
          }
        case 13: break;
        case 1:
          { /* ignore */
          }
        case 14: break;
        case 8:
          { return ACRONYM_DEP;
          }
        case 15: break;
        case 5:
          { return NUM;
          }
        case 16: break;
        case 11: break;
        case 9:
          { return ACRONYM;
          }
        case 17: break;
        case 12: break;
        case 7:
          { return COMPANY;
          }
        case 18: break;
        case 13: break;
        case 10:
          { return EMAIL;
          }
        case 14: break;
        case 1:
          { /* ignore */
          }
        case 15: break;
        case 6:
          { return APOSTROPHE;
          }
        case 19: break;
        case 16: break;
        case 3:
          { return CJ;
          }
        case 17: break;
        case 8:
          { return ACRONYM_DEP;
          }
        case 18: break;
        case 2:
          { return ALPHANUM;
          }
        case 19: break;
        case 4:
          { return HOST;
          }
        case 20: break;
        default:
          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
@@ -16,6 +16,6 @@
 */


WARNING: if you change StandardTokenizerImpl*.jflex and need to regenerate
         the tokenizer, only use the trunk version of JFlex 1.5 (with a minimum
         SVN revision 597) at the moment!
WARNING: if you change StandardTokenizerImpl*.jflex or UAX29URLEmailTokenizer
         and need to regenerate the tokenizer, only use the trunk version
         of JFlex 1.5 (with a minimum SVN revision 597) at the moment!
@@ -83,10 +83,9 @@ public final class StandardTokenizer extends Tokenizer {
  @Deprecated
  public static final int ACRONYM_DEP = 8;

  public static final int URL = 9;
  public static final int SOUTHEAST_ASIAN = 10;
  public static final int IDEOGRAPHIC = 11;
  public static final int HIRAGANA = 12;
  public static final int SOUTHEAST_ASIAN = 9;
  public static final int IDEOGRAPHIC = 10;
  public static final int HIRAGANA = 11;

  /** String token types that correspond to token type int constants */
  public static final String [] TOKEN_TYPES = new String [] {

@@ -99,7 +98,6 @@ public final class StandardTokenizer extends Tokenizer {
    "<NUM>",
    "<CJ>",
    "<ACRONYM_DEP>",
    "<URL>",
    "<SOUTHEAST_ASIAN>",
    "<IDEOGRAPHIC>",
    "<HIRAGANA>"
File diff suppressed because it is too large
@@ -23,14 +23,11 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 * This class implements Word Break rules from the Unicode Text Segmentation
 * algorithm, as specified in
 * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
 * URLs and email addresses are also tokenized according to the relevant RFCs.
 * <p/>
 * Tokens produced are of the following types:
 * <ul>
 *   <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
 *   <li><NUM>: A number</li>
 *   <li><URL>: A URL</li>
 *   <li><EMAIL>: An email address</li>
 *   <li><SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast
 *       Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
 *   <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>

@@ -67,83 +64,6 @@ MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]*
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*


// URL and E-mail syntax specifications:
//
//   RFC-952:  DOD INTERNET HOST TABLE SPECIFICATION
//   RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
//   RFC-1123: Requirements for Internet Hosts - Application and Support
//   RFC-1738: Uniform Resource Locators (URL)
//   RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
//   RFC-5234: Augmented BNF for Syntax Specifications: ABNF
//   RFC-5321: Simple Mail Transfer Protocol
//   RFC-5322: Internet Message Format

%include src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro

DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
DomainNameLoose = {DomainLabel} ("." {DomainLabel})*

IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
IPv4Address = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3}
IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
IPv6Address = ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
            | "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
            | {IPv6Hex16Bit}? "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit} ":" {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::" {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit}
            | (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"

URIunreserved = [-._~A-Za-z0-9]
URIpercentEncoded = "%" [0-9A-Fa-f]{2}
URIsubDelims = [!$&'()*+,;=]
URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
URIquery = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIport = ":" [0-9]{1,5}
URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}
URIhostLoose = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose}

URIauthorityStrict = {URIhostStrict} {URIport}?
URIauthorityLoose = {URIlogin}? {URIhostLoose} {URIport}?

HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
HTTPpath = ("/" {HTTPsegment})*
HTTPscheme = [hH][tT][tT][pP][sS]? "://"
HTTPurlFull = {HTTPscheme} {URIauthorityLoose} {HTTPpath}? {URIquery}? {URIfragment}?
// {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
HTTPurlNoScheme = {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}

FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
FTPscheme = [fF][tT][pP] "://"
FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?

FILEscheme = [fF][iI][lL][eE] "://"
FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?

URL = {HTTPurl} | {FTPurl} | {FILEurl}

EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
// DFA minimization allows {IPv6Address} and {IPv4Address} to be included
// in the {EMAILbracketedHost} definition without incurring any size penalties,
// since {EMAILdomainLiteralText} recognizes all valid IP addresses.
// The IP address regexes are included in {EMAILbracketedHost} simply as a
// reminder that they are acceptable bracketed host forms.
EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"
EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})

%{
  /** Alphanumeric sequences */
  public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;

@@ -151,12 +71,6 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
  /** Numbers */
  public static final int NUMERIC_TYPE = StandardTokenizer.NUM;

  /** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
  public static final int URL_TYPE = StandardTokenizer.URL;

  /** E-mail addresses */
  public static final int EMAIL_TYPE = StandardTokenizer.EMAIL;

  /**
   * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
   * scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept

@@ -191,9 +105,6 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
//
<<EOF>> { return StandardTokenizerInterface.YYEOF; }

{URL}   { return URL_TYPE; }
{EMAIL} { return EMAIL_TYPE; }

// UAX#29 WB8.   Numeric × Numeric
//        WB11.  Numeric (MidNum | MidNumLet) × Numeric
//        WB12.  Numeric × (MidNum | MidNumLet) Numeric
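The WB8/WB11/WB12 rules retained above are what keep digit groups joined by MidNum characters together as one token; a quick check (hedged sketch, assuming the 3.1-era StandardTokenizer(Version, Reader) constructor; the input is made up):

    import java.io.StringReader;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class MidNumDemo {
      public static void main(String[] args) throws Exception {
        StandardTokenizer st =
            new StandardTokenizer(Version.LUCENE_31, new StringReader("1,000.5"));
        CharTermAttribute term = st.addAttribute(CharTermAttribute.class);
        while (st.incrementToken()) {
          // "1,000.5" survives as a single <NUM> token under WB11/WB12.
          System.out.println(term.toString());
        }
        st.end();
        st.close();
      }
    }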
@@ -1,847 +0,0 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/13/10 12:12 AM */

package org.apache.lucene.analysis.standard;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;


/**
 * This class implements Word Break rules from the Unicode Text Segmentation
 * algorithm, as specified in
 * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
 * <p/>
 * Tokens produced are of the following types:
 * <ul>
 *   <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
 *   <li><NUM>: A number</li>
 *   <li><SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast
 *       Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
 *   <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>
 *   <li><HIRAGANA>: A single hiragana character</li>
 * </ul>
 * <b>WARNING</b>: Because JFlex does not support Unicode supplementary
 * characters (characters above the Basic Multilingual Plane, which contains
 * those up to and including U+FFFF), this scanner will not recognize them
 * properly. If you need to be able to process text containing supplementary
 * characters, consider using the ICU4J-backed implementation in modules/analysis/icu
 * (org.apache.lucene.analysis.icu.segmentation.ICUTokenizer)
 * instead of this class, since the ICU4J-backed implementation does not have
 * this limitation.
 */

public final class UAX29Tokenizer extends Tokenizer {

  /** This character denotes the end of file */
  private static final int YYEOF = -1;

  /** initial size of the lookahead buffer */
  private static final int ZZ_BUFFERSIZE = 16384;

  /** lexical states */
  private static final int YYINITIAL = 0;

  /**
   * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
   * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
   *                  at the beginning of a line
   * l is of the form l = 2*k, k a non negative integer
   */
  private static final int ZZ_LEXSTATE[] = {
    0, 0
  };

  /**
   * Translates characters to character classes
   */
  private static final String ZZ_CMAP_PACKED =
    "\47\0\1\7\4\0\1\6\1\0\1\7\1\0\12\3\1\5\1\6"+
    "\5\0\32\1\4\0\1\10\1\0\32\1\57\0\1\1\2\0\1\2"+
    "\7\0\1\1\1\0\1\5\2\0\1\1\5\0\27\1\1\0\37\1"+
    "\1\0\u01ca\1\4\0\14\1\16\0\5\1\7\0\1\1\1\0\1\1"+
    "\21\0\160\2\5\1\1\0\2\1\2\0\4\1\1\6\7\0\1\1"+
    "\1\5\3\1\1\0\1\1\1\0\24\1\1\0\123\1\1\0\213\1"+
    "\1\0\7\2\236\1\11\0\46\1\2\0\1\1\7\0\47\1\1\0"+
    "\1\6\7\0\55\2\1\0\1\2\1\0\2\2\1\0\2\2\1\0"+
    "\1\2\10\0\33\1\5\0\4\1\1\5\13\0\4\2\10\0\2\6"+
    "\2\0\13\2\5\0\53\1\25\2\12\3\1\0\1\3\1\6\1\0"+
    "\2\1\1\2\143\1\1\0\1\1\10\2\1\0\6\2\2\1\2\2"+
    "\1\0\4\2\2\1\12\3\3\1\2\0\1\1\17\0\1\2\1\1"+
    "\1\2\36\1\33\2\2\0\131\1\13\2\1\1\16\0\12\3\41\1"+
    "\11\2\2\1\2\0\1\6\1\0\1\1\5\0\26\1\4\2\1\1"+
    "\11\2\1\1\3\2\1\1\5\2\22\0\31\1\3\2\244\0\4\2"+
    "\66\1\3\2\1\1\22\2\1\1\7\2\12\1\2\2\2\0\12\3"+
    "\1\0\7\1\1\0\7\1\1\0\3\2\1\0\10\1\2\0\2\1"+
    "\2\0\26\1\1\0\7\1\1\0\1\1\3\0\4\1\2\0\1\2"+
    "\1\1\7\2\2\0\2\2\2\0\3\2\1\1\10\0\1\2\4\0"+
    "\2\1\1\0\3\1\2\2\2\0\12\3\2\1\17\0\3\2\1\0"+
    "\6\1\4\0\2\1\2\0\26\1\1\0\7\1\1\0\2\1\1\0"+
    "\2\1\1\0\2\1\2\0\1\2\1\0\5\2\4\0\2\2\2\0"+
    "\3\2\3\0\1\2\7\0\4\1\1\0\1\1\7\0\12\3\2\2"+
    "\3\1\1\2\13\0\3\2\1\0\11\1\1\0\3\1\1\0\26\1"+
    "\1\0\7\1\1\0\2\1\1\0\5\1\2\0\1\2\1\1\10\2"+
    "\1\0\3\2\1\0\3\2\2\0\1\1\17\0\2\1\2\2\2\0"+
    "\12\3\21\0\3\2\1\0\10\1\2\0\2\1\2\0\26\1\1\0"+
    "\7\1\1\0\2\1\1\0\5\1\2\0\1\2\1\1\7\2\2\0"+
    "\2\2\2\0\3\2\10\0\2\2\4\0\2\1\1\0\3\1\2\2"+
    "\2\0\12\3\1\0\1\1\20\0\1\2\1\1\1\0\6\1\3\0"+
    "\3\1\1\0\4\1\3\0\2\1\1\0\1\1\1\0\2\1\3\0"+
    "\2\1\3\0\3\1\3\0\14\1\4\0\5\2\3\0\3\2\1\0"+
    "\4\2\2\0\1\1\6\0\1\2\16\0\12\3\21\0\3\2\1\0"+
    "\10\1\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1\3\0"+
    "\1\1\7\2\1\0\3\2\1\0\4\2\7\0\2\2\1\0\2\1"+
    "\6\0\2\1\2\2\2\0\12\3\22\0\2\2\1\0\10\1\1\0"+
    "\3\1\1\0\27\1\1\0\12\1\1\0\5\1\2\0\1\2\1\1"+
    "\7\2\1\0\3\2\1\0\4\2\7\0\2\2\7\0\1\1\1\0"+
    "\2\1\2\2\2\0\12\3\1\0\2\1\17\0\2\2\1\0\10\1"+
    "\1\0\3\1\1\0\51\1\2\0\1\1\7\2\1\0\3\2\1\0"+
    "\4\2\1\1\10\0\1\2\10\0\2\1\2\2\2\0\12\3\12\0"+
    "\6\1\2\0\2\2\1\0\22\1\3\0\30\1\1\0\11\1\1\0"+
    "\1\1\2\0\7\1\3\0\1\2\4\0\6\2\1\0\1\2\1\0"+
    "\10\2\22\0\2\2\15\0\60\11\1\12\2\11\7\12\5\0\7\11"+
    "\10\12\1\0\12\3\47\0\2\11\1\0\1\11\2\0\2\11\1\0"+
    "\1\11\2\0\1\11\6\0\4\11\1\0\7\11\1\0\3\11\1\0"+
    "\1\11\1\0\1\11\2\0\2\11\1\0\4\11\1\12\2\11\6\12"+
    "\1\0\2\12\1\11\2\0\5\11\1\0\1\11\1\0\6\12\2\0"+
    "\12\3\2\0\2\11\42\0\1\1\27\0\2\2\6\0\12\3\13\0"+
    "\1\2\1\0\1\2\1\0\1\2\4\0\2\2\10\1\1\0\44\1"+
    "\4\0\24\2\1\0\2\2\5\1\13\2\1\0\44\2\11\0\1\2"+
    "\71\0\53\11\24\12\1\11\12\3\6\0\6\11\4\12\4\11\3\12"+
    "\1\11\3\12\2\11\7\12\3\11\4\12\15\11\14\12\1\11\1\12"+
    "\12\3\4\12\2\11\46\1\12\0\53\1\1\0\1\1\3\0\u0149\1"+
    "\1\0\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0\51\1"+
    "\1\0\4\1\2\0\41\1\1\0\4\1\2\0\7\1\1\0\1\1"+
    "\1\0\4\1\2\0\17\1\1\0\71\1\1\0\4\1\2\0\103\1"+
    "\2\0\3\2\40\0\20\1\20\0\125\1\14\0\u026c\1\2\0\21\1"+
    "\1\0\32\1\5\0\113\1\3\0\3\1\17\0\15\1\1\0\4\1"+
    "\3\2\13\0\22\1\3\2\13\0\22\1\2\2\14\0\15\1\1\0"+
    "\3\1\1\0\2\2\14\0\64\11\40\12\3\0\1\11\4\0\1\11"+
    "\1\12\2\0\12\3\41\0\3\2\2\0\12\3\6\0\130\1\10\0"+
    "\51\1\1\2\1\1\5\0\106\1\12\0\35\1\3\0\14\2\4\0"+
    "\14\2\12\0\12\3\36\11\2\0\5\11\13\0\54\11\4\0\21\12"+
    "\7\11\2\12\6\0\12\3\1\11\3\0\2\11\40\0\27\1\5\2"+
    "\4\0\65\11\12\12\1\0\35\12\2\0\1\2\12\3\6\0\12\3"+
    "\6\0\16\11\122\0\5\2\57\1\21\2\7\1\4\0\12\3\21\0"+
    "\11\2\14\0\3\2\36\1\12\2\3\0\2\1\12\3\6\0\46\1"+
    "\16\2\14\0\44\1\24\2\10\0\12\3\3\0\3\1\12\3\44\1"+
    "\122\0\3\2\1\0\25\2\4\1\1\2\4\1\1\2\15\0\300\1"+
    "\47\2\25\0\4\2\u0116\1\2\0\6\1\2\0\46\1\2\0\6\1"+
    "\2\0\10\1\1\0\1\1\1\0\1\1\1\0\1\1\1\0\37\1"+
    "\2\0\65\1\1\0\7\1\1\0\1\1\3\0\3\1\1\0\7\1"+
    "\3\0\4\1\2\0\6\1\4\0\15\1\5\0\3\1\1\0\7\1"+
    "\17\0\4\2\10\0\2\7\12\0\1\7\2\0\1\5\2\0\5\2"+
    "\20\0\2\10\3\0\1\6\17\0\1\10\13\0\5\2\5\0\6\2"+
    "\1\0\1\1\15\0\1\1\20\0\15\1\63\0\41\2\21\0\1\1"+
    "\4\0\1\1\2\0\12\1\1\0\1\1\3\0\5\1\6\0\1\1"+
    "\1\0\1\1\1\0\1\1\1\0\4\1\1\0\13\1\2\0\4\1"+
    "\5\0\5\1\4\0\1\1\21\0\51\1\u032d\0\64\1\u0716\0\57\1"+
    "\1\0\57\1\1\0\205\1\6\0\4\1\3\2\16\0\46\1\12\0"+
    "\66\1\11\0\1\1\17\0\1\2\27\1\11\0\7\1\1\0\7\1"+
    "\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1"+
    "\1\0\7\1\1\0\40\2\57\0\1\1\120\0\32\13\1\0\131\13"+
    "\14\0\326\13\57\0\1\1\1\0\1\13\31\0\11\13\6\2\1\0"+
    "\5\4\2\0\3\13\1\1\1\1\4\0\126\14\2\0\2\2\2\4"+
    "\3\14\133\4\1\0\4\4\5\0\51\1\3\0\136\1\21\0\33\1"+
    "\65\0\20\4\320\0\57\4\1\0\130\4\250\0\u19b6\13\112\0\u51cc\13"+
    "\64\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\3\2\1"+
    "\24\0\57\1\4\2\11\0\2\2\1\0\31\1\10\0\120\1\2\2"+
    "\45\0\11\1\2\0\147\1\2\0\4\1\1\0\2\1\16\0\12\1"+
    "\120\0\10\1\1\2\3\1\1\2\4\1\1\2\27\1\5\2\30\0"+
    "\64\1\14\0\2\2\62\1\21\2\13\0\12\3\6\0\22\2\6\1"+
    "\3\0\1\1\4\0\12\3\34\1\10\2\2\0\27\1\15\2\14\0"+
    "\35\1\3\0\4\2\57\1\16\2\16\0\1\1\12\3\46\0\51\1"+
    "\16\2\11\0\3\1\1\2\10\1\2\2\2\0\12\3\6\0\33\11"+
    "\1\12\4\0\60\11\1\12\1\11\3\12\2\11\2\12\5\11\2\12"+
    "\1\11\1\12\1\11\30\0\5\11\41\0\6\1\2\0\6\1\2\0"+
    "\6\1\11\0\7\1\1\0\7\1\221\0\43\1\10\2\1\0\2\2"+
    "\2\0\12\3\6\0\u2ba4\1\14\0\27\1\4\0\61\1\u2104\0\u012e\13"+
    "\2\0\76\13\2\0\152\13\46\0\7\1\14\0\5\1\5\0\1\1"+
    "\1\2\12\1\1\0\15\1\1\0\5\1\1\0\1\1\1\0\2\1"+
    "\1\0\2\1\1\0\154\1\41\0\u016b\1\22\0\100\1\2\0\66\1"+
    "\50\0\14\1\4\0\20\2\1\6\2\0\1\5\1\6\13\0\7\2"+
    "\14\0\2\10\30\0\3\10\1\6\1\0\1\7\1\0\1\6\1\5"+
    "\32\0\5\1\1\0\207\1\2\0\1\2\7\0\1\7\4\0\1\6"+
    "\1\0\1\7\1\0\12\3\1\5\1\6\5\0\32\1\4\0\1\10"+
    "\1\0\32\1\13\0\70\4\2\2\37\1\3\0\6\1\2\0\6\1"+
    "\2\0\6\1\2\0\3\1\34\0\3\2\4\0";

  /**
   * Translates characters to character classes
   */
  private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);

  /**
   * Translates DFA states to action switch labels.
   */
  private static final int [] ZZ_ACTION = zzUnpackAction();

  private static final String ZZ_ACTION_PACKED_0 =
    "\1\0\1\1\1\2\1\3\1\2\1\1\1\4\1\5"+
    "\1\6\1\2\1\0\1\2\1\0\1\3\2\0";

  private static int [] zzUnpackAction() {
    int [] result = new int[16];
    int offset = 0;
    offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
    return result;
  }

  private static int zzUnpackAction(String packed, int offset, int [] result) {
    int i = 0;       /* index in packed string  */
    int j = offset;  /* index in unpacked array */
    int l = packed.length();
    while (i < l) {
      int count = packed.charAt(i++);
      int value = packed.charAt(i++);
      do result[j++] = value; while (--count > 0);
    }
    return j;
  }


  /**
   * Translates a state to a row index in the transition table
   */
  private static final int [] ZZ_ROWMAP = zzUnpackRowMap();

  private static final String ZZ_ROWMAP_PACKED_0 =
    "\0\0\0\15\0\32\0\47\0\64\0\101\0\116\0\15"+
    "\0\15\0\133\0\150\0\165\0\202\0\217\0\101\0\234";

  private static int [] zzUnpackRowMap() {
    int [] result = new int[16];
    int offset = 0;
    offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
    return result;
  }

  private static int zzUnpackRowMap(String packed, int offset, int [] result) {
    int i = 0;       /* index in packed string  */
    int j = offset;  /* index in unpacked array */
    int l = packed.length();
    while (i < l) {
      int high = packed.charAt(i++) << 16;
      result[j++] = high | packed.charAt(i++);
    }
    return j;
  }

  /**
   * The transition table of the DFA
   */
  private static final int [] ZZ_TRANS = zzUnpackTrans();

  private static final String ZZ_TRANS_PACKED_0 =
    "\1\2\1\3\1\2\1\4\1\5\3\2\1\6\2\7"+
    "\1\10\1\11\16\0\2\3\1\12\1\0\1\13\1\0"+
    "\1\13\1\14\1\0\1\3\3\0\1\3\2\4\2\0"+
    "\2\15\1\16\1\0\1\4\4\0\1\5\1\0\1\5"+
    "\3\0\1\14\1\0\1\5\3\0\1\3\1\17\1\4"+
    "\1\5\3\0\1\17\1\0\1\17\13\0\2\7\3\0"+
    "\1\3\2\12\2\0\2\20\1\14\1\0\1\12\3\0"+
    "\1\3\1\13\7\0\1\13\3\0\1\3\1\14\1\12"+
    "\1\5\3\0\1\14\1\0\1\14\4\0\1\15\1\4"+
    "\6\0\1\15\3\0\1\3\1\16\1\4\1\5\3\0"+
    "\1\16\1\0\1\16\4\0\1\20\1\12\6\0\1\20"+
    "\2\0";

  private static int [] zzUnpackTrans() {
    int [] result = new int[169];
    int offset = 0;
    offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
    return result;
  }

  private static int zzUnpackTrans(String packed, int offset, int [] result) {
    int i = 0;       /* index in packed string  */
    int j = offset;  /* index in unpacked array */
    int l = packed.length();
    while (i < l) {
      int count = packed.charAt(i++);
      int value = packed.charAt(i++);
      value--;
      do result[j++] = value; while (--count > 0);
    }
    return j;
  }


  /* error codes */
  private static final int ZZ_UNKNOWN_ERROR = 0;
  private static final int ZZ_NO_MATCH = 1;
  private static final int ZZ_PUSHBACK_2BIG = 2;

  /* error messages for the codes above */
  private static final String ZZ_ERROR_MSG[] = {
    "Unkown internal scanner error",
    "Error: could not match input",
    "Error: pushback value was too large"
  };

  /**
   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
   */
  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();

  private static final String ZZ_ATTRIBUTE_PACKED_0 =
    "\1\0\1\11\5\1\2\11\1\1\1\0\1\1\1\0"+
    "\1\1\2\0";

  private static int [] zzUnpackAttribute() {
    int [] result = new int[16];
    int offset = 0;
    offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
    return result;
  }

  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
    int i = 0;       /* index in packed string  */
    int j = offset;  /* index in unpacked array */
    int l = packed.length();
    while (i < l) {
      int count = packed.charAt(i++);
      int value = packed.charAt(i++);
      do result[j++] = value; while (--count > 0);
    }
    return j;
  }

  /** the input device */
  private java.io.Reader zzReader;

  /** the current state of the DFA */
  private int zzState;

  /** the current lexical state */
  private int zzLexicalState = YYINITIAL;

  /** this buffer contains the current text to be matched and is
      the source of the yytext() string */
  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];

  /** the textposition at the last accepting state */
  private int zzMarkedPos;

  /** the current text position in the buffer */
  private int zzCurrentPos;

  /** startRead marks the beginning of the yytext() string in the buffer */
  private int zzStartRead;

  /** endRead marks the last character in the buffer, that has been read
      from input */
  private int zzEndRead;

  /** number of newlines encountered up to the start of the matched text */
  private int yyline;

  /** the number of characters up to the start of the matched text */
  private int yychar;

  /**
   * the number of characters from the last newline up to the start of the
   * matched text
   */
  private int yycolumn;

  /**
   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
   */
  private boolean zzAtBOL = true;

  /** zzAtEOF == true <=> the scanner is at the EOF */
  private boolean zzAtEOF;

  /** denotes if the user-EOF-code has already been executed */
  private boolean zzEOFDone;

  /* user code: */
  /** Alphanumeric sequences */
  public static final String WORD_TYPE = "<ALPHANUM>";

  /** Numbers */
  public static final String NUMERIC_TYPE = "<NUM>";

  /**
   * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
   * scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
   * together as as a single token rather than broken up, because the logic
   * required to break them at word boundaries is too complex for UAX#29.
   * <p>
   * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
   */
  public static final String SOUTH_EAST_ASIAN_TYPE = "<SOUTHEAST_ASIAN>";

  public static final String IDEOGRAPHIC_TYPE = "<IDEOGRAPHIC>";

  public static final String HIRAGANA_TYPE = "<HIRAGANA>";

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncrAtt
    = addAttribute(PositionIncrementAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

  private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
  private int posIncr;


  /**
   * @param source The AttributeSource to use
   * @param input The input reader
   */
  public UAX29Tokenizer(AttributeSource source, Reader input) {
    super(source, input);
    zzReader = input;
  }

  /**
   * @param factory The AttributeFactory to use
   * @param input The input reader
   */
  public UAX29Tokenizer(AttributeFactory factory, Reader input) {
    super(factory, input);
    zzReader = input;
  }

  /**
   * Set the max allowed token length. Any token longer than this is skipped.
   * @param length the new max allowed token length
   */
  public void setMaxTokenLength(int length) {
    this.maxTokenLength = length;
  }

  /**
   * Returns the max allowed token length. Any token longer than this is
   * skipped.
   * @return the max allowed token length
   */
  public int getMaxTokenLength() {
    return maxTokenLength;
  }

  @Override
  public final void end() {
    // set final offset
    int finalOffset = correctOffset(yychar + yylength());
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  @Override
  public void reset(Reader reader) throws IOException {
    super.reset(reader);
    yyreset(reader);
  }

  @Override
  public final boolean incrementToken() throws IOException {
    // This method is required because of two JFlex limitations:
    // 1. No way to insert code at the beginning of the generated scanning
    //    get-next-token method; and
    // 2. No way to declare @Override on the generated scanning method.
    clearAttributes();
    posIncr = 1;
    return getNextToken();
  }

  /**
   * Populates this TokenStream's CharTermAttribute and OffsetAttribute from
   * the current match, the TypeAttribute from the passed-in tokenType, and
   * the PositionIncrementAttribute to one, unless the immediately previous
   * token(s) was/were skipped because maxTokenLength was exceeded, in which
   * case the PositionIncrementAttribute is set to one plus the number of
   * skipped overly long tokens.
   * <p/>
   * If maxTokenLength is exceeded, the CharTermAttribute is set back to empty
   * and false is returned.
   *
   * @param tokenType The type of the matching token
   * @return true there is a token available (not too long); false otherwise
   */
  private boolean populateAttributes(String tokenType) {
    boolean isTokenAvailable = false;
    if (yylength() > maxTokenLength) {
      // When we skip a too-long token, we treat it like a stopword, introducing
      // a position increment gap
      ++posIncr;
    } else {
      termAtt.copyBuffer(zzBuffer, zzStartRead, yylength());
      posIncrAtt.setPositionIncrement(posIncr);
      offsetAtt.setOffset(correctOffset(yychar),
                          correctOffset(yychar + yylength()));
      typeAtt.setType(tokenType);
      isTokenAvailable = true;
    }
    return isTokenAvailable;
  }


  /**
   * Creates a new scanner
   * There is also a java.io.InputStream version of this constructor.
   *
   * @param in the java.io.Reader to read input from.
   */
  public UAX29Tokenizer(java.io.Reader in) {
    super(in);
    this.zzReader = in;
  }

  /**
   * Creates a new scanner.
   * There is also java.io.Reader version of this constructor.
   *
   * @param in the java.io.Inputstream to read input from.
   */
  public UAX29Tokenizer(java.io.InputStream in) {
    this(new java.io.InputStreamReader(in));
  }

  /**
   * Unpacks the compressed character translation table.
   *
   * @param packed the packed character translation table
   * @return the unpacked character translation table
   */
  private static char [] zzUnpackCMap(String packed) {
    char [] map = new char[0x10000];
    int i = 0;  /* index in packed string  */
    int j = 0;  /* index in unpacked array */
    while (i < 2174) {
      int count = packed.charAt(i++);
      char value = packed.charAt(i++);
      do map[j++] = value; while (--count > 0);
    }
    return map;
  }


  /**
   * Refills the input buffer.
   *
   * @return <code>false</code>, iff there was new input.
   *
   * @exception java.io.IOException if any I/O-Error occurs
   */
  private boolean zzRefill() throws java.io.IOException {

    /* first: make room (if you can) */
    if (zzStartRead > 0) {
      System.arraycopy(zzBuffer, zzStartRead,
                       zzBuffer, 0,
                       zzEndRead-zzStartRead);

      /* translate stored positions */
      zzEndRead-= zzStartRead;
      zzCurrentPos-= zzStartRead;
      zzMarkedPos-= zzStartRead;
      zzStartRead = 0;
    }

    /* is the buffer big enough? */
    if (zzCurrentPos >= zzBuffer.length) {
      /* if not: blow it up */
      char newBuffer[] = new char[zzCurrentPos*2];
      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
      zzBuffer = newBuffer;
    }

    /* finally: fill the buffer with new input */
    int numRead = zzReader.read(zzBuffer, zzEndRead,
                                zzBuffer.length-zzEndRead);

    if (numRead > 0) {
      zzEndRead+= numRead;
      return false;
    }
    // unlikely but not impossible: read 0 characters, but not at end of stream
    if (numRead == 0) {
      int c = zzReader.read();
      if (c == -1) {
        return true;
      } else {
        zzBuffer[zzEndRead++] = (char) c;
        return false;
      }
    }

    // numRead < 0
    return true;
  }


  /**
   * Closes the input stream.
   */
  private final void yyclose() throws java.io.IOException {
    zzAtEOF = true;            /* indicate end of file */
    zzEndRead = zzStartRead;   /* invalidate buffer    */

    if (zzReader != null)
      zzReader.close();
  }


  /**
   * Resets the scanner to read from a new input stream.
   * Does not close the old reader.
   *
   * All internal variables are reset, the old input stream
   * <b>cannot</b> be reused (internal buffer is discarded and lost).
   * Lexical state is set to <tt>ZZ_INITIAL</tt>.
   *
   * Internal scan buffer is resized down to its initial length, if it has grown.
   *
   * @param reader the new input stream
   */
  private final void yyreset(java.io.Reader reader) {
    zzReader = reader;
    zzAtBOL = true;
    zzAtEOF = false;
    zzEOFDone = false;
    zzEndRead = zzStartRead = 0;
    zzCurrentPos = zzMarkedPos = 0;
    yyline = yychar = yycolumn = 0;
    zzLexicalState = YYINITIAL;
    if (zzBuffer.length > ZZ_BUFFERSIZE)
      zzBuffer = new char[ZZ_BUFFERSIZE];
  }


  /**
   * Returns the current lexical state.
   */
  private final int yystate() {
    return zzLexicalState;
  }


  /**
   * Enters a new lexical state
   *
   * @param newState the new lexical state
   */
  private final void yybegin(int newState) {
    zzLexicalState = newState;
  }


  /**
   * Returns the text matched by the current regular expression.
   */
  private final String yytext() {
    return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
  }


  /**
   * Returns the character at position <tt>pos</tt> from the
   * matched text.
   *
   * It is equivalent to yytext().charAt(pos), but faster
   *
   * @param pos the position of the character to fetch.
   *            A value from 0 to yylength()-1.
   *
   * @return the character at position pos
   */
  private final char yycharat(int pos) {
    return zzBuffer[zzStartRead+pos];
  }


  /**
   * Returns the length of the matched text region.
   */
  private final int yylength() {
    return zzMarkedPos-zzStartRead;
  }


  /**
   * Reports an error that occured while scanning.
   *
   * In a wellformed scanner (no or only correct usage of
   * yypushback(int) and a match-all fallback rule) this method
   * will only be called with things that "Can't Possibly Happen".
   * If this method is called, something is seriously wrong
   * (e.g. a JFlex bug producing a faulty scanner etc.).
   *
   * Usual syntax/scanner level error handling should be done
   * in error fallback rules.
   *
   * @param errorCode the code of the errormessage to display
   */
  private void zzScanError(int errorCode) {
    String message;
    try {
      message = ZZ_ERROR_MSG[errorCode];
    }
    catch (ArrayIndexOutOfBoundsException e) {
      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
    }

    throw new Error(message);
  }


  /**
   * Pushes the specified amount of characters back into the input stream.
   *
   * They will be read again by then next call of the scanning method
   *
   * @param number the number of characters to be read again.
   *               This number must not be greater than yylength()!
   */
  private void yypushback(int number) {
    if ( number > yylength() )
      zzScanError(ZZ_PUSHBACK_2BIG);

    zzMarkedPos -= number;
  }


  /**
   * Resumes scanning until the next regular expression is matched,
   * the end of input is encountered or an I/O-Error occurs.
   *
   * @return the next token
   * @exception java.io.IOException if any I/O-Error occurs
   */
  private boolean getNextToken() throws java.io.IOException {
    int zzInput;
    int zzAction;

    // cached fields:
    int zzCurrentPosL;
    int zzMarkedPosL;
    int zzEndReadL = zzEndRead;
    char [] zzBufferL = zzBuffer;
    char [] zzCMapL = ZZ_CMAP;

    int [] zzTransL = ZZ_TRANS;
    int [] zzRowMapL = ZZ_ROWMAP;
    int [] zzAttrL = ZZ_ATTRIBUTE;

    while (true) {
      zzMarkedPosL = zzMarkedPos;

      yychar+= zzMarkedPosL-zzStartRead;

      zzAction = -1;

      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;

      zzState = ZZ_LEXSTATE[zzLexicalState];


      zzForAction: {
        while (true) {

          if (zzCurrentPosL < zzEndReadL)
            zzInput = zzBufferL[zzCurrentPosL++];
          else if (zzAtEOF) {
            zzInput = YYEOF;
            break zzForAction;
          }
          else {
            // store back cached positions
            zzCurrentPos = zzCurrentPosL;
            zzMarkedPos = zzMarkedPosL;
            boolean eof = zzRefill();
            // get translated positions and possibly new buffer
            zzCurrentPosL = zzCurrentPos;
            zzMarkedPosL = zzMarkedPos;
            zzBufferL = zzBuffer;
            zzEndReadL = zzEndRead;
            if (eof) {
              zzInput = YYEOF;
              break zzForAction;
            }
            else {
              zzInput = zzBufferL[zzCurrentPosL++];
            }
          }
          int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
          if (zzNext == -1) break zzForAction;
          zzState = zzNext;

          int zzAttributes = zzAttrL[zzState];
          if ( (zzAttributes & 1) == 1 ) {
            zzAction = zzState;
            zzMarkedPosL = zzCurrentPosL;
            if ( (zzAttributes & 8) == 8 ) break zzForAction;
          }

        }
      }

      // store back cached position
      zzMarkedPos = zzMarkedPosL;

      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
        case 5:
          { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true;
          }
        case 7: break;
        case 1:
          { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
          }
        case 8: break;
        case 3:
          { if (populateAttributes(NUMERIC_TYPE)) return true;
          }
        case 9: break;
        case 6:
          { if (populateAttributes(HIRAGANA_TYPE)) return true;
          }
        case 10: break;
        case 4:
          { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true;
          }
        case 11: break;
        case 2:
          { if (populateAttributes(WORD_TYPE)) return true;
          }
        case 12: break;
        default:
          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
            zzAtEOF = true;
            {
              return false;
            }
          }
          else {
            zzScanError(ZZ_NO_MATCH);
          }
      }
    }
  }

}
File diff suppressed because it is too large
@@ -32,11 +32,14 @@ import org.apache.lucene.util.AttributeSource;
 * This class implements Word Break rules from the Unicode Text Segmentation
 * algorithm, as specified in
 * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
 * URLs and email addresses are also tokenized according to the relevant RFCs.
 * <p/>
 * Tokens produced are of the following types:
 * <ul>
 *   <li><ALPHANUM>: A sequence of alphabetic and numeric characters</li>
 *   <li><NUM>: A number</li>
 *   <li><URL>: A URL</li>
 *   <li><EMAIL>: An email address</li>
 *   <li><SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast
 *       Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
 *   <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>

@@ -57,7 +60,7 @@ import org.apache.lucene.util.AttributeSource;
%final
%public
%apiprivate
%class UAX29Tokenizer
%class UAX29URLEmailTokenizer
%extends Tokenizer
%type boolean
%function getNextToken

@@ -67,7 +70,7 @@ import org.apache.lucene.util.AttributeSource;
    super(in);
%init}

// WB4. X (Extend | Format)* --> X
// UAX#29 WB4. X (Extend | Format)* --> X
//
ALetterEx = \p{WB:ALetter} [\p{WB:Format}\p{WB:Extend}]*
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it

@@ -77,6 +80,85 @@ MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]*
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]*
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*


// URL and E-mail syntax specifications:
//
//   RFC-952:  DOD INTERNET HOST TABLE SPECIFICATION
//   RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
//   RFC-1123: Requirements for Internet Hosts - Application and Support
//   RFC-1738: Uniform Resource Locators (URL)
//   RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
//   RFC-5234: Augmented BNF for Syntax Specifications: ABNF
//   RFC-5321: Simple Mail Transfer Protocol
//   RFC-5322: Internet Message Format

%include src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro

DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
DomainNameLoose = {DomainLabel} ("." {DomainLabel})*

IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
IPv4Address = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3}
IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
IPv6Address = ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
            | "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
            | {IPv6Hex16Bit}? "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit} ":" {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::" {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit}
            | (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"

URIunreserved = [-._~A-Za-z0-9]
URIpercentEncoded = "%" [0-9A-Fa-f]{2}
URIsubDelims = [!$&'()*+,;=]
URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
URIquery = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIport = ":" [0-9]{1,5}
URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}
URIhostLoose = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose}

URIauthorityStrict = {URIhostStrict} {URIport}?
URIauthorityLoose = {URIlogin}? {URIhostLoose} {URIport}?

HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
HTTPpath = ("/" {HTTPsegment})*
HTTPscheme = [hH][tT][tT][pP][sS]? "://"
HTTPurlFull = {HTTPscheme} {URIauthorityLoose} {HTTPpath}? {URIquery}? {URIfragment}?
// {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
HTTPurlNoScheme = {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}

FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
FTPscheme = [fF][tT][pP] "://"
FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?

FILEscheme = [fF][iI][lL][eE] "://"
FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?

URL = {HTTPurl} | {FTPurl} | {FILEurl}

EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
// DFA minimization allows {IPv6Address} and {IPv4Address} to be included
// in the {EMAILbracketedHost} definition without incurring any size penalties,
// since {EMAILdomainLiteralText} recognizes all valid IP addresses.
// The IP address regexes are included in {EMAILbracketedHost} simply as a
// reminder that they are acceptable bracketed host forms.
EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"
EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})


%{
  /** Alphanumeric sequences */
  public static final String WORD_TYPE = "<ALPHANUM>";
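As an aside (not part of the commit), the {IPv4DecimalOctet}/{IPv4Address} macros above translate directly into java.util.regex, which makes them easy to sanity-check in isolation; the class name and test inputs below are made up:

    import java.util.regex.Pattern;

    public class IPv4MacroCheck {
      // Same alternation as {IPv4DecimalOctet}: one octet, 0-255, up to two leading zeros.
      static final String OCTET =
          "0{0,2}[0-9]|0?[1-9][0-9]|1[0-9][0-9]|2(?:[0-4][0-9]|5[0-5])";
      // {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3}
      static final Pattern IPV4 =
          Pattern.compile("(?:" + OCTET + ")(?:\\.(?:" + OCTET + ")){3}");

      public static void main(String[] args) {
        System.out.println(IPV4.matcher("192.168.000.255").matches()); // true
        System.out.println(IPV4.matcher("256.1.1.1").matches());       // false: 256 > 255
      }
    }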
@@ -84,6 +166,12 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
  /** Numbers */
  public static final String NUMERIC_TYPE = "<NUM>";

  /** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
  public static final String URL_TYPE = "<URL>";

  /** E-mail addresses */
  public static final String EMAIL_TYPE = "<EMAIL>";

  /**
   * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
   * scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept

@@ -112,7 +200,7 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
   * @param source The AttributeSource to use
   * @param input The input reader
   */
  public UAX29Tokenizer(AttributeSource source, Reader input) {
  public UAX29URLEmailTokenizer(AttributeSource source, Reader input) {
    super(source, input);
    zzReader = input;
  }

@@ -121,7 +209,7 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
   * @param factory The AttributeFactory to use
   * @param input The input reader
   */
  public UAX29Tokenizer(AttributeFactory factory, Reader input) {
  public UAX29URLEmailTokenizer(AttributeFactory factory, Reader input) {
    super(factory, input);
    zzReader = input;
  }

@@ -201,17 +289,19 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*

%%

// WB1.  sot ÷
// WB2.  ÷ eot
// UAX#29 WB1.  sot ÷
//        WB2.  ÷ eot
//
<<EOF>> { return false; }

{URL}   { if (populateAttributes(URL_TYPE)) return true; }
{EMAIL} { if (populateAttributes(EMAIL_TYPE)) return true; }

// WB8.   Numeric × Numeric
// WB11.  Numeric (MidNum | MidNumLet) × Numeric
// WB12.  Numeric × (MidNum | MidNumLet) Numeric
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
// UAX#29 WB8.   Numeric × Numeric
//        WB11.  Numeric (MidNum | MidNumLet) × Numeric
//        WB12.  Numeric × (MidNum | MidNumLet) Numeric
//        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
//        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
                             | {MidNumericEx} {NumericEx}

@@ -220,14 +310,14 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
  { if (populateAttributes(NUMERIC_TYPE)) return true; }


// WB5.   ALetter × ALetter
// WB6.   ALetter × (MidLetter | MidNumLet) ALetter
// WB7.   ALetter (MidLetter | MidNumLet) × ALetter
// WB9.   ALetter × Numeric
// WB10.  Numeric × ALetter
// WB13.  Katakana × Katakana
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
// UAX#29 WB5.   ALetter × ALetter
//        WB6.   ALetter × (MidLetter | MidNumLet) ALetter
//        WB7.   ALetter (MidLetter | MidNumLet) × ALetter
//        WB9.   ALetter × Numeric
//        WB10.  Numeric × ALetter
//        WB13.  Katakana × Katakana
//        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
//        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
                  | ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*

@@ -260,15 +350,15 @@ ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
//
\p{LB:Complex_Context}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }

// WB14.  Any ÷ Any
// UAX#29 WB14.  Any ÷ Any
//
\p{Script:Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
\p{Script:Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }


// WB3.   CR × LF
// WB3a.  (Newline | CR | LF) ÷
// WB3b.  ÷ (Newline | CR | LF)
// WB14.  Any ÷ Any
// UAX#29 WB3.   CR × LF
//        WB3a.  (Newline | CR | LF) ÷
//        WB3b.  ÷ (Newline | CR | LF)
//        WB14.  Any ÷ Any
//
[^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
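To see the populateAttributes() skip behavior described above in action, a hedged sketch (the over-long token, the limit of 10, and the class name are made up):

    import java.io.StringReader;
    import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

    public class MaxTokenLengthDemo {
      public static void main(String[] args) throws Exception {
        UAX29URLEmailTokenizer t = new UAX29URLEmailTokenizer(
            new StringReader("one reallyreallylongtoken two"));
        t.setMaxTokenLength(10); // anything longer is skipped like a stopword
        PositionIncrementAttribute posIncr =
            t.addAttribute(PositionIncrementAttribute.class);
        while (t.incrementToken()) {
          // prints 1 for "one", then 2 for "two": the skipped token
          // left a position-increment gap behind.
          System.out.println(posIncr.getPositionIncrement());
        }
        t.end();
        t.close();
      }
    }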
@@ -27,7 +27,10 @@
as of Lucene 3.1, implements the Word Break rules from the Unicode Text
Segmentation algorithm, as specified in
<a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
URLs and email addresses are also tokenized according to the relevant RFCs.
Unlike <code>UAX29URLEmailTokenizer</code>, URLs and email addresses are
<b>not</b> tokenized as single tokens, but are instead split up into
tokens according to the UAX#29 word break rules.
<br/>
<code><a href="StandardAnalyzer">StandardAnalyzer</a></code> includes
<code>StandardTokenizer</code>,
<code><a href="StandardFilter">StandardFilter</a></code>,

@@ -46,13 +49,11 @@
<code><a href="../../../../../../all/org/apache/lucene/analysis/LowerCaseFilter.html">LowerCaseFilter</a></code>
and <code><a href="../../../../../../all/org/apache/lucene/analysis/StopFilter.html">StopFilter</a></code>.
</li>
<li><code><a href="UAX29Tokenizer.html">UAX29Tokenizer</a></code>:
    implements the Word Break rules from the Unicode Text Segmentation
    algorithm, as specified in
<li><code><a href="UAX29URLEmailTokenizer.html">UAX29URLEmailTokenizer</a></code>:
    implements the Word Break rules from the Unicode Text Segmentation
    algorithm, as specified in
    <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
    Unlike <code>StandardTokenizer</code>, URLs and email addresses are
    <b>not</b> tokenized as single tokens, but are instead split up into
    tokens according to the UAX#29 word break rules.
    URLs and email addresses are also tokenized according to the relevant RFCs.
</li>
</ul>
</body>
|
||||
|
|
|
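The behavioral contrast documented in this javadoc is easiest to see side by side; the TestThaiAnalyzer change later in this commit records the same split of "xyz@demo.com". A hedged sketch, with the Version constant and the dump() helper as assumptions rather than part of the patch:

import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class TokenizerContrast {
  static void dump(Tokenizer tokenizer) throws Exception {
    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    while (tokenizer.incrementToken()) {
      System.out.print("[" + termAtt.toString() + "] ");
    }
    System.out.println();
    tokenizer.close();
  }

  public static void main(String[] args) throws Exception {
    String text = "contact xyz@demo.com";
    // StandardTokenizer now applies plain UAX#29 word breaks:
    // expected [contact] [xyz] [demo.com]
    dump(new StandardTokenizer(Version.LUCENE_31, new StringReader(text)));
    // UAX29URLEmailTokenizer keeps the address as one <EMAIL> token:
    // expected [contact] [xyz@demo.com]
    dump(new UAX29URLEmailTokenizer(new StringReader(text)));
  }
}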
@ -2,21 +2,14 @@ package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
@ -58,63 +51,6 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
    }
  };

  /** Passes through tokens with type "<URL>" and blocks all other types. */
  private class URLFilter extends TokenFilter {
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    public URLFilter(TokenStream in) {
      super(in);
    }
    @Override
    public final boolean incrementToken() throws java.io.IOException {
      boolean isTokenAvailable = false;
      while (input.incrementToken()) {
        if (typeAtt.type() == StandardTokenizer.TOKEN_TYPES[StandardTokenizer.URL]) {
          isTokenAvailable = true;
          break;
        }
      }
      return isTokenAvailable;
    }
  }

  /** Passes through tokens with type "<EMAIL>" and blocks all other types. */
  private class EmailFilter extends TokenFilter {
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    public EmailFilter(TokenStream in) {
      super(in);
    }
    @Override
    public final boolean incrementToken() throws java.io.IOException {
      boolean isTokenAvailable = false;
      while (input.incrementToken()) {
        if (typeAtt.type() == StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMAIL]) {
          isTokenAvailable = true;
          break;
        }
      }
      return isTokenAvailable;
    }
  }

  private Analyzer urlAnalyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      StandardTokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
      tokenizer.setMaxTokenLength(Integer.MAX_VALUE);  // Tokenize arbitrary length URLs
      TokenFilter filter = new URLFilter(tokenizer);
      return new TokenStreamComponents(tokenizer, filter);
    }
  };

  private Analyzer emailAnalyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
      TokenFilter filter = new EmailFilter(tokenizer);
      return new TokenStreamComponents(tokenizer, filter);
    }
  };

  public void testArmenian() throws Exception {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
        new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
@ -261,138 +197,6 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
        new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
  }

  public void testWikiURLs() throws Exception {
    Reader reader = null;
    String luceneResourcesWikiPage;
    try {
      reader = new InputStreamReader
        (getClass().getResourceAsStream("LuceneResourcesWikiPage.html"), "UTF-8");
      StringBuilder builder = new StringBuilder();
      char[] buffer = new char[1024];
      int numCharsRead;
      while (-1 != (numCharsRead = reader.read(buffer))) {
        builder.append(buffer, 0, numCharsRead);
      }
      luceneResourcesWikiPage = builder.toString();
    } finally {
      if (null != reader) {
        reader.close();
      }
    }
    assertTrue(null != luceneResourcesWikiPage
               && luceneResourcesWikiPage.length() > 0);
    BufferedReader bufferedReader = null;
    String[] urls;
    try {
      List<String> urlList = new ArrayList<String>();
      bufferedReader = new BufferedReader(new InputStreamReader
        (getClass().getResourceAsStream("LuceneResourcesWikiPageURLs.txt"), "UTF-8"));
      String line;
      while (null != (line = bufferedReader.readLine())) {
        line = line.trim();
        if (line.length() > 0) {
          urlList.add(line);
        }
      }
      urls = urlList.toArray(new String[urlList.size()]);
    } finally {
      if (null != bufferedReader) {
        bufferedReader.close();
      }
    }
    assertTrue(null != urls && urls.length > 0);
    BaseTokenStreamTestCase.assertAnalyzesTo
      (urlAnalyzer, luceneResourcesWikiPage, urls);
  }

  public void testEmails() throws Exception {
    Reader reader = null;
    String randomTextWithEmails;
    try {
      reader = new InputStreamReader
        (getClass().getResourceAsStream("random.text.with.email.addresses.txt"), "UTF-8");
      StringBuilder builder = new StringBuilder();
      char[] buffer = new char[1024];
      int numCharsRead;
      while (-1 != (numCharsRead = reader.read(buffer))) {
        builder.append(buffer, 0, numCharsRead);
      }
      randomTextWithEmails = builder.toString();
    } finally {
      if (null != reader) {
        reader.close();
      }
    }
    assertTrue(null != randomTextWithEmails
               && randomTextWithEmails.length() > 0);
    BufferedReader bufferedReader = null;
    String[] emails;
    try {
      List<String> emailList = new ArrayList<String>();
      bufferedReader = new BufferedReader(new InputStreamReader
        (getClass().getResourceAsStream("email.addresses.from.random.text.with.email.addresses.txt"), "UTF-8"));
      String line;
      while (null != (line = bufferedReader.readLine())) {
        line = line.trim();
        if (line.length() > 0) {
          emailList.add(line);
        }
      }
      emails = emailList.toArray(new String[emailList.size()]);
    } finally {
      if (null != bufferedReader) {
        bufferedReader.close();
      }
    }
    assertTrue(null != emails && emails.length > 0);
    BaseTokenStreamTestCase.assertAnalyzesTo
      (emailAnalyzer, randomTextWithEmails, emails);
  }

  public void testURLs() throws Exception {
    Reader reader = null;
    String randomTextWithURLs;
    try {
      reader = new InputStreamReader
        (getClass().getResourceAsStream("random.text.with.urls.txt"), "UTF-8");
      StringBuilder builder = new StringBuilder();
      char[] buffer = new char[1024];
      int numCharsRead;
      while (-1 != (numCharsRead = reader.read(buffer))) {
        builder.append(buffer, 0, numCharsRead);
      }
      randomTextWithURLs = builder.toString();
    } finally {
      if (null != reader) {
        reader.close();
      }
    }
    assertTrue(null != randomTextWithURLs
               && randomTextWithURLs.length() > 0);
    BufferedReader bufferedReader = null;
    String[] urls;
    try {
      List<String> urlList = new ArrayList<String>();
      bufferedReader = new BufferedReader(new InputStreamReader
        (getClass().getResourceAsStream("urls.from.random.text.with.urls.txt"), "UTF-8"));
      String line;
      while (null != (line = bufferedReader.readLine())) {
        line = line.trim();
        if (line.length() > 0) {
          urlList.add(line);
        }
      }
      urls = urlList.toArray(new String[urlList.size()]);
    } finally {
      if (null != bufferedReader) {
        bufferedReader.close();
      }
    }
    assertTrue(null != urls && urls.length > 0);
    BaseTokenStreamTestCase.assertAnalyzesTo
      (urlAnalyzer, randomTextWithURLs, urls);
  }

  public void testUnicodeWordBreaks() throws Exception {
    WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
    wordBreakTest.test(a);
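The three relocated test methods above, and their copies in TestUAX29URLEmailTokenizer below, each repeat the same read-a-UTF-8-classpath-resource idiom. A hypothetical helper, not present in the patch, that would consolidate it:

  /** Hypothetical consolidation of the repeated resource-slurping idiom above. */
  private String readResource(String name) throws IOException {
    Reader reader = new InputStreamReader(getClass().getResourceAsStream(name), "UTF-8");
    try {
      StringBuilder builder = new StringBuilder();
      char[] buffer = new char[1024];
      int numCharsRead;
      while (-1 != (numCharsRead = reader.read(buffer))) {
        builder.append(buffer, 0, numCharsRead);
      }
      return builder.toString();
    } finally {
      reader.close();
    }
  }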
@ -2,14 +2,21 @@ package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.UAX29Tokenizer;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
@ -28,7 +35,7 @@ import java.util.Arrays;
 * limitations under the License.
 */

public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {

  public void testHugeDoc() throws IOException {
    StringBuilder sb = new StringBuilder();
@ -37,7 +44,7 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
    sb.append(whitespace);
    sb.append("testing 1234");
    String input = sb.toString();
    UAX29Tokenizer tokenizer = new UAX29Tokenizer(new StringReader(input));
    UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(new StringReader(input));
    BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
  }
@ -46,11 +53,70 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
    protected TokenStreamComponents createComponents
      (String fieldName, Reader reader) {

      Tokenizer tokenizer = new UAX29Tokenizer(reader);
      Tokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
      return new TokenStreamComponents(tokenizer);
    }
  };

  /** Passes through tokens with type "<URL>" and blocks all other types. */
  private class URLFilter extends TokenFilter {
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    public URLFilter(TokenStream in) {
      super(in);
    }
    @Override
    public final boolean incrementToken() throws java.io.IOException {
      boolean isTokenAvailable = false;
      while (input.incrementToken()) {
        if (typeAtt.type() == UAX29URLEmailTokenizer.URL_TYPE) {
          isTokenAvailable = true;
          break;
        }
      }
      return isTokenAvailable;
    }
  }

  /** Passes through tokens with type "<EMAIL>" and blocks all other types. */
  private class EmailFilter extends TokenFilter {
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    public EmailFilter(TokenStream in) {
      super(in);
    }
    @Override
    public final boolean incrementToken() throws java.io.IOException {
      boolean isTokenAvailable = false;
      while (input.incrementToken()) {
        if (typeAtt.type() == UAX29URLEmailTokenizer.EMAIL_TYPE) {
          isTokenAvailable = true;
          break;
        }
      }
      return isTokenAvailable;
    }
  }

  private Analyzer urlAnalyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
      tokenizer.setMaxTokenLength(Integer.MAX_VALUE);  // Tokenize arbitrary length URLs
      TokenFilter filter = new URLFilter(tokenizer);
      return new TokenStreamComponents(tokenizer, filter);
    }
  };

  private Analyzer emailAnalyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
      TokenFilter filter = new EmailFilter(tokenizer);
      return new TokenStreamComponents(tokenizer, filter);
    }
  };

  public void testArmenian() throws Exception {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
        new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
@ -163,7 +229,6 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
    BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
    BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
    BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
  }

  public void testTextWithNumbersSA() throws Exception {
@ -197,6 +262,140 @@ public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
        new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
  }

  public void testWikiURLs() throws Exception {
    Reader reader = null;
    String luceneResourcesWikiPage;
    try {
      reader = new InputStreamReader(getClass().getResourceAsStream
        ("LuceneResourcesWikiPage.html"), "UTF-8");
      StringBuilder builder = new StringBuilder();
      char[] buffer = new char[1024];
      int numCharsRead;
      while (-1 != (numCharsRead = reader.read(buffer))) {
        builder.append(buffer, 0, numCharsRead);
      }
      luceneResourcesWikiPage = builder.toString();
    } finally {
      if (null != reader) {
        reader.close();
      }
    }
    assertTrue(null != luceneResourcesWikiPage
               && luceneResourcesWikiPage.length() > 0);
    BufferedReader bufferedReader = null;
    String[] urls;
    try {
      List<String> urlList = new ArrayList<String>();
      bufferedReader = new BufferedReader(new InputStreamReader
        (getClass().getResourceAsStream("LuceneResourcesWikiPageURLs.txt"), "UTF-8"));
      String line;
      while (null != (line = bufferedReader.readLine())) {
        line = line.trim();
        if (line.length() > 0) {
          urlList.add(line);
        }
      }
      urls = urlList.toArray(new String[urlList.size()]);
    } finally {
      if (null != bufferedReader) {
        bufferedReader.close();
      }
    }
    assertTrue(null != urls && urls.length > 0);
    BaseTokenStreamTestCase.assertAnalyzesTo
      (urlAnalyzer, luceneResourcesWikiPage, urls);
  }

  public void testEmails() throws Exception {
    Reader reader = null;
    String randomTextWithEmails;
    try {
      reader = new InputStreamReader(getClass().getResourceAsStream
        ("random.text.with.email.addresses.txt"), "UTF-8");
      StringBuilder builder = new StringBuilder();
      char[] buffer = new char[1024];
      int numCharsRead;
      while (-1 != (numCharsRead = reader.read(buffer))) {
        builder.append(buffer, 0, numCharsRead);
      }
      randomTextWithEmails = builder.toString();
    } finally {
      if (null != reader) {
        reader.close();
      }
    }
    assertTrue(null != randomTextWithEmails
               && randomTextWithEmails.length() > 0);
    BufferedReader bufferedReader = null;
    String[] emails;
    try {
      List<String> emailList = new ArrayList<String>();
      bufferedReader = new BufferedReader(new InputStreamReader
        (getClass().getResourceAsStream
         ("email.addresses.from.random.text.with.email.addresses.txt"), "UTF-8"));
      String line;
      while (null != (line = bufferedReader.readLine())) {
        line = line.trim();
        if (line.length() > 0) {
          emailList.add(line);
        }
      }
      emails = emailList.toArray(new String[emailList.size()]);
    } finally {
      if (null != bufferedReader) {
        bufferedReader.close();
      }
    }
    assertTrue(null != emails && emails.length > 0);
    BaseTokenStreamTestCase.assertAnalyzesTo
      (emailAnalyzer, randomTextWithEmails, emails);
  }

  public void testURLs() throws Exception {
    Reader reader = null;
    String randomTextWithURLs;
    try {
      reader = new InputStreamReader(getClass().getResourceAsStream
        ("random.text.with.urls.txt"), "UTF-8");
      StringBuilder builder = new StringBuilder();
      char[] buffer = new char[1024];
      int numCharsRead;
      while (-1 != (numCharsRead = reader.read(buffer))) {
        builder.append(buffer, 0, numCharsRead);
      }
      randomTextWithURLs = builder.toString();
    } finally {
      if (null != reader) {
        reader.close();
      }
    }
    assertTrue(null != randomTextWithURLs
               && randomTextWithURLs.length() > 0);
    BufferedReader bufferedReader = null;
    String[] urls;
    try {
      List<String> urlList = new ArrayList<String>();
      bufferedReader = new BufferedReader(new InputStreamReader
        (getClass().getResourceAsStream
         ("urls.from.random.text.with.urls.txt"), "UTF-8"));
      String line;
      while (null != (line = bufferedReader.readLine())) {
        line = line.trim();
        if (line.length() > 0) {
          urlList.add(line);
        }
      }
      urls = urlList.toArray(new String[urlList.size()]);
    } finally {
      if (null != bufferedReader) {
        bufferedReader.close();
      }
    }
    assertTrue(null != urls && urls.length > 0);
    BaseTokenStreamTestCase.assertAnalyzesTo
      (urlAnalyzer, randomTextWithURLs, urls);
  }

  public void testUnicodeWordBreaks() throws Exception {
    WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
    wordBreakTest.test(a);
@ -123,7 +123,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
    assertAnalyzesToReuse(
        analyzer,
        "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
        new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz@demo.com" });
        new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });
  }

  /** @deprecated (3.1) for version back compat */
@ -302,8 +302,10 @@ New Features
* SOLR-1336: Add SmartChinese (word segmentation for Simplified Chinese)
  tokenizer and filters to contrib/analysis-extras (rmuir)

* SOLR-2211: Added UAX29TokenizerFactory, which implements UAX#29, a unicode algorithm
  with good results for most languages. (Tom Burton-West via rmuir)
* SOLR-2211,LUCENE-2763: Added UAX29URLEmailTokenizerFactory, which implements
  UAX#29, a unicode algorithm with good results for most languages, as well as
  URL and E-mail tokenization according to the relevant RFCs.
  (Tom Burton-West via rmuir)

* SOLR-2237: Added StempelPolishStemFilterFactory to contrib/analysis-extras (rmuir)
@ -20,7 +20,7 @@ package org.apache.solr.analysis;

import org.apache.lucene.analysis.standard.UAX29Tokenizer;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;

import java.io.Reader;
import java.util.Map;
@ -30,14 +30,14 @@ import java.util.Map;
 *
 */

public class UAX29TokenizerFactory extends BaseTokenizerFactory {
public class UAX29URLEmailTokenizerFactory extends BaseTokenizerFactory {
  @Override
  public void init(Map<String,String> args) {
    super.init(args);
    assureMatchVersion();
  }

  public UAX29Tokenizer create(Reader input) {
    return new UAX29Tokenizer(input);
  public UAX29URLEmailTokenizer create(Reader input) {
    return new UAX29URLEmailTokenizer(input);
  }
}
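Mirroring the factory tests that follow, a minimal sketch of exercising the renamed factory outside Solr's schema machinery. The "luceneMatchVersion" init key is how assureMatchVersion() is normally satisfied; the version label and the surrounding class are assumptions, not part of the patch:

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.solr.analysis.UAX29URLEmailTokenizerFactory;

public class FactoryUsageSketch {
  public static void main(String[] args) throws Exception {
    UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
    // Solr normally injects luceneMatchVersion from the schema; this map
    // stands in for it (version label assumed).
    Map<String,String> initArgs = new HashMap<String,String>();
    initArgs.put("luceneMatchVersion", "LUCENE_40");
    factory.init(initArgs); // init() calls assureMatchVersion()
    Tokenizer stream = factory.create(new StringReader("mail bugs@example.org"));
    // Expected tokens: "mail" and the unsplit address "bugs@example.org"
  }
}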
@ -1,81 +0,0 @@
package org.apache.solr.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;

/**
 * A few tests based on org.apache.lucene.analysis.TestUAX29Tokenizer;
 */

public class TestUAX29TokenizerFactory extends BaseTokenTestCase {
  /**
   * Test UAX29TokenizerFactory
   */
  public void testUAX29Tokenizer() throws Exception {
    Reader reader = new StringReader("Wha\u0301t's this thing do?");
    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] {"Wha\u0301t's", "this", "thing", "do" });
  }

  public void testArabic() throws Exception {
    Reader reader = new StringReader("الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.");
    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] {"الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
                      "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" });
  }

  public void testChinese() throws Exception {
    Reader reader = new StringReader("我是中国人。 1234 Tests ");
    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] {"我", "是", "中", "国", "人", "1234", "Tests"});
  }
  public void testKorean() throws Exception {
    Reader reader = new StringReader("안녕하세요 한글입니다");
    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] {"안녕하세요", "한글입니다"});
  }

  public void testHyphen() throws Exception {
    Reader reader = new StringReader("some-dashed-phrase");
    UAX29TokenizerFactory factory = new UAX29TokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] {"some", "dashed", "phrase"});
  }

}
@ -0,0 +1,155 @@
package org.apache.solr.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;

/**
 * A few tests based on org.apache.lucene.analysis.TestUAX29URLEmailTokenizer
 */

public class TestUAX29URLEmailTokenizerFactory extends BaseTokenTestCase {

  public void testUAX29URLEmailTokenizer() throws Exception {
    Reader reader = new StringReader("Wha\u0301t's this thing do?");
    UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] {"Wha\u0301t's", "this", "thing", "do" });
  }

  public void testArabic() throws Exception {
    Reader reader = new StringReader("الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.");
    UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] {"الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
                      "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" });
  }

  public void testChinese() throws Exception {
    Reader reader = new StringReader("我是中国人。 1234 Tests ");
    UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] {"我", "是", "中", "国", "人", "1234", "Tests"});
  }

  public void testKorean() throws Exception {
    Reader reader = new StringReader("안녕하세요 한글입니다");
    UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] {"안녕하세요", "한글입니다"});
  }

  public void testHyphen() throws Exception {
    Reader reader = new StringReader("some-dashed-phrase");
    UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] {"some", "dashed", "phrase"});
  }

  // Test with some URLs from TestUAX29URLEmailTokenizer's
  // urls.from.random.text.with.urls.txt
  public void testURLs() throws Exception {
    String textWithURLs
      = "http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram&paragraphs=50&length=200&no-ads=on\n"
      + " some extra\nWords thrown in here. "
      + "http://c5-3486.bisynxu.FR/aI.YnNms/"
      + " samba Halta gamba "
      + "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R\n"
      + "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb\n"
      + "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m"
      + " inter Locutio "
      + "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/\n"
      + "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7"
      + " blah Sirrah woof "
      + "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4\n";
    Reader reader = new StringReader(textWithURLs);
    UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] {
          "http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram&paragraphs=50&length=200&no-ads=on",
          "some", "extra", "Words", "thrown", "in", "here",
          "http://c5-3486.bisynxu.FR/aI.YnNms/",
          "samba", "Halta", "gamba",
          "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R",
          "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb",
          "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m",
          "inter", "Locutio",
          "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/",
          "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7",
          "blah", "Sirrah", "woof",
          "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4"
        }
    );
  }

  // Test with some emails from TestUAX29URLEmailTokenizer's
  // email.addresses.from.random.text.with.email.addresses.txt
  public void testEmails() throws Exception {
    String textWithEmails
      = " some extra\nWords thrown in here. "
      + "dJ8ngFi@avz13m.CC\n"
      + "kU-l6DS@[082.015.228.189]\n"
      + "\"%U\u0012@?\\B\"@Fl2d.md"
      + " samba Halta gamba "
      + "Bvd#@tupjv.sn\n"
      + "SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt\n"
      + "~+Kdz@3mousnl.SE\n"
      + " inter Locutio "
      + "C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY\n"
      + "}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM"
      + " blah Sirrah woof "
      + "lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae\n"
      + "lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H\n";
    Reader reader = new StringReader(textWithEmails);
    UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    Tokenizer stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] {
          "some", "extra", "Words", "thrown", "in", "here",
          "dJ8ngFi@avz13m.CC",
          "kU-l6DS@[082.015.228.189]",
          "\"%U\u0012@?\\B\"@Fl2d.md",
          "samba", "Halta", "gamba",
          "Bvd#@tupjv.sn",
          "SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt",
          "~+Kdz@3mousnl.SE",
          "inter", "Locutio",
          "C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY",
          "}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM",
          "blah", "Sirrah", "woof",
          "lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae",
          "lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H"
        }
    );
  }
}