LUCENE-3880: UAX29URLEmailTokenizer now recognizes emails when the mailto: scheme is prepended.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1302265 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2012-03-19 03:13:52 +00:00
parent 1991b61b49
commit c4f72f61ac
11 changed files with 7835 additions and 2853 deletions

View File

@ -972,6 +972,9 @@ Bug fixes
* LUCENE-3876: Fix bug where positions for a document exceeding
Integer.MAX_VALUE/2 would produce a corrupt index.
(Simon Willnauer, Mike Mccandless, Robert Muir)
* LUCENE-3880: UAX29URLEmailTokenizer now recognizes emails when the mailto:
scheme is prepended. (Kai Gülzau, Steve Rowe)
Optimizations

View File

@ -98,6 +98,9 @@
<jflex file="src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex"
outdir="src/java/org/apache/lucene/analysis/standard/std31"
nobak="on" />
<jflex file="src/java/org/apache/lucene/analysis/standard/std34/UAX29URLEmailTokenizerImpl34.jflex"
outdir="src/java/org/apache/lucene/analysis/standard/std34"
nobak="on" />
</target>
<target name="clean-jflex">

View File

@ -15,8 +15,8 @@
*/
// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
// file version from Thursday, August 4, 2011 11:34:20 AM UTC
// generated on Thursday, August 4, 2011 11:46:19 PM UTC
// file version from Sunday, March 18, 2012 4:34:02 AM UTC
// generated on Sunday, March 18, 2012 4:02:55 PM UTC
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
ASCIITLD = "." (
@ -79,6 +79,7 @@ ASCIITLD = "." (
| [cC][rR]
| [cC][uU]
| [cC][vV]
| [cC][wW]
| [cC][xX]
| [cC][yY]
| [cC][zZ]
@ -247,6 +248,7 @@ ASCIITLD = "." (
| [sS][tT]
| [sS][uU]
| [sS][vV]
| [sS][xX]
| [sS][yY]
| [sS][zZ]
| [tT][cC]
@ -288,6 +290,7 @@ ASCIITLD = "." (
| [xX][nN]--3[eE]0[bB]707[eE]
| [xX][nN]--45[bB][rR][jJ]9[cC]
| [xX][nN]--80[aA][kK][hH][bB][yY][kK][nN][jJ]4[fF]
| [xX][nN]--80[aA][oO]21[aA]
| [xX][nN]--90[aA]3[aA][cC]
| [xX][nN]--9[tT]4[bB]11[yY][iI]5[aA]
| [xX][nN]--[cC][lL][cC][hH][cC]0[eE][aA]0[bB]2[gG]2[aA]9[gG][cC][dD]

View File

@ -23,8 +23,8 @@ import java.io.InputStreamReader;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.std31.StandardTokenizerImpl31;
import org.apache.lucene.analysis.standard.std31.UAX29URLEmailTokenizerImpl31;
import org.apache.lucene.analysis.standard.std34.UAX29URLEmailTokenizerImpl34;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@ -128,8 +128,10 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
}
private static StandardTokenizerInterface getScannerFor(Version matchVersion, Reader input) {
if (matchVersion.onOrAfter(Version.LUCENE_34)) {
if (matchVersion.onOrAfter(Version.LUCENE_36)) {
return new UAX29URLEmailTokenizerImpl(input);
} else if (matchVersion.onOrAfter(Version.LUCENE_34)) {
return new UAX29URLEmailTokenizerImpl34(input);
} else {
return new UAX29URLEmailTokenizerImpl31(input);
}

View File

@ -205,6 +205,10 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
<<EOF>> { return StandardTokenizerInterface.YYEOF; }
{URL} { return URL_TYPE; }
// LUCENE-3880: Disrupt recognition of "mailto:test" as <ALPHANUM> from "mailto:test@example.org"
[mM][aA][iI][lL][tT][oO] / ":" {EMAIL} { return WORD_TYPE; }
{EMAIL} { return EMAIL_TYPE; }
// UAX#29 WB8. Numeric × Numeric

View File

@ -0,0 +1,334 @@
/*
* Copyright 2001-2005 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
// file version from Thursday, August 4, 2011 11:34:20 AM UTC
// generated on Thursday, August 4, 2011 11:46:19 PM UTC
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
ASCIITLD = "." (
[aA][cC]
| [aA][dD]
| [aA][eE]
| [aA][eE][rR][oO]
| [aA][fF]
| [aA][gG]
| [aA][iI]
| [aA][lL]
| [aA][mM]
| [aA][nN]
| [aA][oO]
| [aA][qQ]
| [aA][rR]
| [aA][rR][pP][aA]
| [aA][sS]
| [aA][sS][iI][aA]
| [aA][tT]
| [aA][uU]
| [aA][wW]
| [aA][xX]
| [aA][zZ]
| [bB][aA]
| [bB][bB]
| [bB][dD]
| [bB][eE]
| [bB][fF]
| [bB][gG]
| [bB][hH]
| [bB][iI]
| [bB][iI][zZ]
| [bB][jJ]
| [bB][mM]
| [bB][nN]
| [bB][oO]
| [bB][rR]
| [bB][sS]
| [bB][tT]
| [bB][vV]
| [bB][wW]
| [bB][yY]
| [bB][zZ]
| [cC][aA]
| [cC][aA][tT]
| [cC][cC]
| [cC][dD]
| [cC][fF]
| [cC][gG]
| [cC][hH]
| [cC][iI]
| [cC][kK]
| [cC][lL]
| [cC][mM]
| [cC][nN]
| [cC][oO]
| [cC][oO][mM]
| [cC][oO][oO][pP]
| [cC][rR]
| [cC][uU]
| [cC][vV]
| [cC][xX]
| [cC][yY]
| [cC][zZ]
| [dD][eE]
| [dD][jJ]
| [dD][kK]
| [dD][mM]
| [dD][oO]
| [dD][zZ]
| [eE][cC]
| [eE][dD][uU]
| [eE][eE]
| [eE][gG]
| [eE][rR]
| [eE][sS]
| [eE][tT]
| [eE][uU]
| [fF][iI]
| [fF][jJ]
| [fF][kK]
| [fF][mM]
| [fF][oO]
| [fF][rR]
| [gG][aA]
| [gG][bB]
| [gG][dD]
| [gG][eE]
| [gG][fF]
| [gG][gG]
| [gG][hH]
| [gG][iI]
| [gG][lL]
| [gG][mM]
| [gG][nN]
| [gG][oO][vV]
| [gG][pP]
| [gG][qQ]
| [gG][rR]
| [gG][sS]
| [gG][tT]
| [gG][uU]
| [gG][wW]
| [gG][yY]
| [hH][kK]
| [hH][mM]
| [hH][nN]
| [hH][rR]
| [hH][tT]
| [hH][uU]
| [iI][dD]
| [iI][eE]
| [iI][lL]
| [iI][mM]
| [iI][nN]
| [iI][nN][fF][oO]
| [iI][nN][tT]
| [iI][oO]
| [iI][qQ]
| [iI][rR]
| [iI][sS]
| [iI][tT]
| [jJ][eE]
| [jJ][mM]
| [jJ][oO]
| [jJ][oO][bB][sS]
| [jJ][pP]
| [kK][eE]
| [kK][gG]
| [kK][hH]
| [kK][iI]
| [kK][mM]
| [kK][nN]
| [kK][pP]
| [kK][rR]
| [kK][wW]
| [kK][yY]
| [kK][zZ]
| [lL][aA]
| [lL][bB]
| [lL][cC]
| [lL][iI]
| [lL][kK]
| [lL][rR]
| [lL][sS]
| [lL][tT]
| [lL][uU]
| [lL][vV]
| [lL][yY]
| [mM][aA]
| [mM][cC]
| [mM][dD]
| [mM][eE]
| [mM][gG]
| [mM][hH]
| [mM][iI][lL]
| [mM][kK]
| [mM][lL]
| [mM][mM]
| [mM][nN]
| [mM][oO]
| [mM][oO][bB][iI]
| [mM][pP]
| [mM][qQ]
| [mM][rR]
| [mM][sS]
| [mM][tT]
| [mM][uU]
| [mM][uU][sS][eE][uU][mM]
| [mM][vV]
| [mM][wW]
| [mM][xX]
| [mM][yY]
| [mM][zZ]
| [nN][aA]
| [nN][aA][mM][eE]
| [nN][cC]
| [nN][eE]
| [nN][eE][tT]
| [nN][fF]
| [nN][gG]
| [nN][iI]
| [nN][lL]
| [nN][oO]
| [nN][pP]
| [nN][rR]
| [nN][uU]
| [nN][zZ]
| [oO][mM]
| [oO][rR][gG]
| [pP][aA]
| [pP][eE]
| [pP][fF]
| [pP][gG]
| [pP][hH]
| [pP][kK]
| [pP][lL]
| [pP][mM]
| [pP][nN]
| [pP][rR]
| [pP][rR][oO]
| [pP][sS]
| [pP][tT]
| [pP][wW]
| [pP][yY]
| [qQ][aA]
| [rR][eE]
| [rR][oO]
| [rR][sS]
| [rR][uU]
| [rR][wW]
| [sS][aA]
| [sS][bB]
| [sS][cC]
| [sS][dD]
| [sS][eE]
| [sS][gG]
| [sS][hH]
| [sS][iI]
| [sS][jJ]
| [sS][kK]
| [sS][lL]
| [sS][mM]
| [sS][nN]
| [sS][oO]
| [sS][rR]
| [sS][tT]
| [sS][uU]
| [sS][vV]
| [sS][yY]
| [sS][zZ]
| [tT][cC]
| [tT][dD]
| [tT][eE][lL]
| [tT][fF]
| [tT][gG]
| [tT][hH]
| [tT][jJ]
| [tT][kK]
| [tT][lL]
| [tT][mM]
| [tT][nN]
| [tT][oO]
| [tT][pP]
| [tT][rR]
| [tT][rR][aA][vV][eE][lL]
| [tT][tT]
| [tT][vV]
| [tT][wW]
| [tT][zZ]
| [uU][aA]
| [uU][gG]
| [uU][kK]
| [uU][sS]
| [uU][yY]
| [uU][zZ]
| [vV][aA]
| [vV][cC]
| [vV][eE]
| [vV][gG]
| [vV][iI]
| [vV][nN]
| [vV][uU]
| [wW][fF]
| [wW][sS]
| [xX][nN]--0[zZ][wW][mM]56[dD]
| [xX][nN]--11[bB]5[bB][sS]3[aA]9[aA][jJ]6[gG]
| [xX][nN]--3[eE]0[bB]707[eE]
| [xX][nN]--45[bB][rR][jJ]9[cC]
| [xX][nN]--80[aA][kK][hH][bB][yY][kK][nN][jJ]4[fF]
| [xX][nN]--90[aA]3[aA][cC]
| [xX][nN]--9[tT]4[bB]11[yY][iI]5[aA]
| [xX][nN]--[cC][lL][cC][hH][cC]0[eE][aA]0[bB]2[gG]2[aA]9[gG][cC][dD]
| [xX][nN]--[dD][eE][bB][aA]0[aA][dD]
| [xX][nN]--[fF][iI][qQ][sS]8[sS]
| [xX][nN]--[fF][iI][qQ][zZ]9[sS]
| [xX][nN]--[fF][pP][cC][rR][jJ]9[cC]3[dD]
| [xX][nN]--[fF][zZ][cC]2[cC]9[eE]2[cC]
| [xX][nN]--[gG]6[wW]251[dD]
| [xX][nN]--[gG][eE][cC][rR][jJ]9[cC]
| [xX][nN]--[hH]2[bB][rR][jJ]9[cC]
| [xX][nN]--[hH][gG][bB][kK]6[aA][jJ]7[fF]53[bB][bB][aA]
| [xX][nN]--[hH][lL][cC][jJ]6[aA][yY][aA]9[eE][sS][cC]7[aA]
| [xX][nN]--[jJ]6[wW]193[gG]
| [xX][nN]--[jJ][xX][aA][lL][pP][dD][lL][pP]
| [xX][nN]--[kK][gG][bB][eE][cC][hH][tT][vV]
| [xX][nN]--[kK][pP][rR][wW]13[dD]
| [xX][nN]--[kK][pP][rR][yY]57[dD]
| [xX][nN]--[lL][gG][bB][bB][aA][tT]1[aA][dD]8[jJ]
| [xX][nN]--[mM][gG][bB][aA][aA][mM]7[aA]8[hH]
| [xX][nN]--[mM][gG][bB][aA][yY][hH]7[gG][pP][aA]
| [xX][nN]--[mM][gG][bB][bB][hH]1[aA]71[eE]
| [xX][nN]--[mM][gG][bB][cC]0[aA]9[aA][zZ][cC][gG]
| [xX][nN]--[mM][gG][bB][eE][rR][pP]4[aA]5[dD]4[aA][rR]
| [xX][nN]--[oO]3[cC][wW]4[hH]
| [xX][nN]--[oO][gG][bB][pP][fF]8[fF][lL]
| [xX][nN]--[pP]1[aA][iI]
| [xX][nN]--[pP][gG][bB][sS]0[dD][hH]
| [xX][nN]--[sS]9[bB][rR][jJ]9[cC]
| [xX][nN]--[wW][gG][bB][hH]1[cC]
| [xX][nN]--[wW][gG][bB][lL]6[aA]
| [xX][nN]--[xX][kK][cC]2[aA][lL]3[hH][yY][eE]2[aA]
| [xX][nN]--[xX][kK][cC]2[dD][lL]3[aA]5[eE][eE]0[hH]
| [xX][nN]--[yY][fF][rR][oO]4[iI]67[oO]
| [xX][nN]--[yY][gG][bB][iI]2[aA][mM][mM][xX]
| [xX][nN]--[zZ][cC][kK][zZ][aA][hH]
| [xX][xX][xX]
| [yY][eE]
| [yY][tT]
| [zZ][aA]
| [zZ][mM]
| [zZ][wW]
) "."? // Accept trailing root (empty) domain

View File

@ -0,0 +1,125 @@
/*
* Copyright 2010 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Generated using ICU4J 4.8.0.0 on Friday, September 30, 2011 4:10:42 PM UTC
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
ALetterSupp = (
([\ud80d][\uDC00-\uDC2E])
| ([\ud80c][\uDC00-\uDFFF])
| ([\ud809][\uDC00-\uDC62])
| ([\ud808][\uDC00-\uDF6E])
| ([\ud81a][\uDC00-\uDE38])
| ([\ud804][\uDC03-\uDC37\uDC83-\uDCAF])
| ([\ud835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB])
| ([\ud801][\uDC00-\uDC9D])
| ([\ud800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5])
| ([\ud803][\uDC00-\uDC48])
| ([\ud802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72])
)
FormatSupp = (
([\ud804][\uDCBD])
| ([\ud834][\uDD73-\uDD7A])
| ([\udb40][\uDC01\uDC20-\uDC7F])
)
ExtendSupp = (
([\ud804][\uDC00-\uDC02\uDC38-\uDC46\uDC80-\uDC82\uDCB0-\uDCBA])
| ([\ud834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44])
| ([\ud800][\uDDFD])
| ([\udb40][\uDD00-\uDDEF])
| ([\ud802][\uDE01-\uDE03\uDE05\uDE06\uDE0C-\uDE0F\uDE38-\uDE3A\uDE3F])
)
NumericSupp = (
([\ud804][\uDC66-\uDC6F])
| ([\ud835][\uDFCE-\uDFFF])
| ([\ud801][\uDCA0-\uDCA9])
)
KatakanaSupp = (
([\ud82c][\uDC00])
)
MidLetterSupp = (
[]
)
MidNumSupp = (
[]
)
MidNumLetSupp = (
[]
)
ExtendNumLetSupp = (
[]
)
ExtendNumLetSupp = (
[]
)
ComplexContextSupp = (
[]
)
HanSupp = (
([\ud87e][\uDC00-\uDE1D])
| ([\ud86b][\uDC00-\uDFFF])
| ([\ud86a][\uDC00-\uDFFF])
| ([\ud869][\uDC00-\uDED6\uDF00-\uDFFF])
| ([\ud868][\uDC00-\uDFFF])
| ([\ud86e][\uDC00-\uDC1D])
| ([\ud86d][\uDC00-\uDF34\uDF40-\uDFFF])
| ([\ud86c][\uDC00-\uDFFF])
| ([\ud863][\uDC00-\uDFFF])
| ([\ud862][\uDC00-\uDFFF])
| ([\ud861][\uDC00-\uDFFF])
| ([\ud860][\uDC00-\uDFFF])
| ([\ud867][\uDC00-\uDFFF])
| ([\ud866][\uDC00-\uDFFF])
| ([\ud865][\uDC00-\uDFFF])
| ([\ud864][\uDC00-\uDFFF])
| ([\ud858][\uDC00-\uDFFF])
| ([\ud859][\uDC00-\uDFFF])
| ([\ud85a][\uDC00-\uDFFF])
| ([\ud85b][\uDC00-\uDFFF])
| ([\ud85c][\uDC00-\uDFFF])
| ([\ud85d][\uDC00-\uDFFF])
| ([\ud85e][\uDC00-\uDFFF])
| ([\ud85f][\uDC00-\uDFFF])
| ([\ud850][\uDC00-\uDFFF])
| ([\ud851][\uDC00-\uDFFF])
| ([\ud852][\uDC00-\uDFFF])
| ([\ud853][\uDC00-\uDFFF])
| ([\ud854][\uDC00-\uDFFF])
| ([\ud855][\uDC00-\uDFFF])
| ([\ud856][\uDC00-\uDFFF])
| ([\ud857][\uDC00-\uDFFF])
| ([\ud849][\uDC00-\uDFFF])
| ([\ud848][\uDC00-\uDFFF])
| ([\ud84b][\uDC00-\uDFFF])
| ([\ud84a][\uDC00-\uDFFF])
| ([\ud84d][\uDC00-\uDFFF])
| ([\ud84c][\uDC00-\uDFFF])
| ([\ud84f][\uDC00-\uDFFF])
| ([\ud84e][\uDC00-\uDFFF])
| ([\ud841][\uDC00-\uDFFF])
| ([\ud840][\uDC00-\uDFFF])
| ([\ud843][\uDC00-\uDFFF])
| ([\ud842][\uDC00-\uDFFF])
| ([\ud845][\uDC00-\uDFFF])
| ([\ud844][\uDC00-\uDFFF])
| ([\ud847][\uDC00-\uDFFF])
| ([\ud846][\uDC00-\uDFFF])
)
HiraganaSupp = (
([\ud83c][\uDE00])
| ([\ud82c][\uDC01])
)

View File

@ -0,0 +1,272 @@
package org.apache.lucene.analysis.standard.std34;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.standard.StandardTokenizerInterface;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* This class implements UAX29URLEmailTokenizer, except with a bug
* (https://issues.apache.org/jira/browse/LUCENE-3880) where "mailto:"
* URI scheme prepended to an email address will disrupt recognition
* of the email address.
* @deprecated This class is only for exact backwards compatibility
*/
@Deprecated
%%
%unicode 6.0
%integer
%final
%public
%class UAX29URLEmailTokenizerImpl34
%implements StandardTokenizerInterface
%function getNextToken
%char
%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
Format = ([\p{WB:Format}] | {FormatSupp})
Numeric = ([\p{WB:Numeric}] | {NumericSupp})
Extend = ([\p{WB:Extend}] | {ExtendSupp})
Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
Han = ([\p{Script:Han}] | {HanSupp})
Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
// Script=Hangul & Aletter
HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
// UAX#29 WB4. X (Extend | Format)* --> X
//
ALetterEx = {ALetter} ({Format} | {Extend})*
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
NumericEx = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
KatakanaEx = {Katakana} ({Format} | {Extend})*
MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
HanEx = {Han} ({Format} | {Extend})*
HiraganaEx = {Hiragana} ({Format} | {Extend})*
// URL and E-mail syntax specifications:
//
// RFC-952: DOD INTERNET HOST TABLE SPECIFICATION
// RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
// RFC-1123: Requirements for Internet Hosts - Application and Support
// RFC-1738: Uniform Resource Locators (URL)
// RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
// RFC-5234: Augmented BNF for Syntax Specifications: ABNF
// RFC-5321: Simple Mail Transfer Protocol
// RFC-5322: Internet Message Format
%include src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
DomainNameLoose = {DomainLabel} ("." {DomainLabel})*
IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
IPv4Address = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3}
IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
IPv6Address = ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
| "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
| {IPv6Hex16Bit}? "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit} ":" {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::" {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit}
| (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"
URIunreserved = [-._~A-Za-z0-9]
URIpercentEncoded = "%" [0-9A-Fa-f]{2}
URIsubDelims = [!$&'()*+,;=]
URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
URIquery = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIport = ":" [0-9]{1,5}
URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}
URIhostLoose = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose}
URIauthorityStrict = {URIhostStrict} {URIport}?
URIauthorityLoose = {URIlogin}? {URIhostLoose} {URIport}?
HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
HTTPpath = ("/" {HTTPsegment})*
HTTPscheme = [hH][tT][tT][pP][sS]? "://"
HTTPurlFull = {HTTPscheme} {URIauthorityLoose} {HTTPpath}? {URIquery}? {URIfragment}?
// {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
HTTPurlNoScheme = {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}
FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
FTPscheme = [fF][tT][pP] "://"
FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?
FILEscheme = [fF][iI][lL][eE] "://"
FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?
URL = {HTTPurl} | {FTPurl} | {FILEurl}
EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
// DFA minimization allows {IPv6Address} and {IPv4Address} to be included
// in the {EMAILbracketedHost} definition without incurring any size penalties,
// since {EMAILdomainLiteralText} recognizes all valid IP addresses.
// The IP address regexes are included in {EMAILbracketedHost} simply as a
// reminder that they are acceptable bracketed host forms.
EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"
EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
%{
/** Alphanumeric sequences */
public static final int WORD_TYPE = UAX29URLEmailTokenizer.ALPHANUM;
/** Numbers */
public static final int NUMERIC_TYPE = UAX29URLEmailTokenizer.NUM;
/**
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
* scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
* together as as a single token rather than broken up, because the logic
* required to break them at word boundaries is too complex for UAX#29.
* <p>
* See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
*/
public static final int SOUTH_EAST_ASIAN_TYPE = UAX29URLEmailTokenizer.SOUTHEAST_ASIAN;
public static final int IDEOGRAPHIC_TYPE = UAX29URLEmailTokenizer.IDEOGRAPHIC;
public static final int HIRAGANA_TYPE = UAX29URLEmailTokenizer.HIRAGANA;
public static final int KATAKANA_TYPE = UAX29URLEmailTokenizer.KATAKANA;
public static final int HANGUL_TYPE = UAX29URLEmailTokenizer.HANGUL;
public static final int EMAIL_TYPE = UAX29URLEmailTokenizer.EMAIL;
public static final int URL_TYPE = UAX29URLEmailTokenizer.URL;
public final int yychar()
{
return yychar;
}
/**
* Fills CharTermAttribute with the current token text.
*/
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
%}
%%
// UAX#29 WB1. sot ÷
// WB2. ÷ eot
//
<<EOF>> { return StandardTokenizerInterface.YYEOF; }
{URL} { return URL_TYPE; }
{EMAIL} { return EMAIL_TYPE; }
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet) × Numeric
// WB12. Numeric × (MidNum | MidNumLet) Numeric
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
| {MidNumericEx} {NumericEx}
| {NumericEx})*
{ExtendNumLetEx}*
{ return NUMERIC_TYPE; }
// subset of the below for typing purposes only!
{HangulEx}+
{ return HANGUL_TYPE; }
{KatakanaEx}+
{ return KATAKANA_TYPE; }
// UAX#29 WB5. ALetter × ALetter
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
// WB7. ALetter (MidLetter | MidNumLet) × ALetter
// WB9. ALetter × Numeric
// WB10. Numeric × ALetter
// WB13. Katakana × Katakana
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ )
({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
{ExtendNumLetEx}*
{ return WORD_TYPE; }
// From UAX #29:
//
// [C]haracters with the Line_Break property values of Contingent_Break (CB),
// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
// boundary property values based on criteria outside of the scope of this
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
// In Unicode 6.0, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC ( ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
// Lao, etc.) are kept together. This grammar does the same below.
//
// See also the Unicode Line Breaking Algorithm:
//
// http://www.unicode.org/reports/tr14/#SA
//
{ComplexContext}+ { return SOUTH_EAST_ASIAN_TYPE; }
// UAX#29 WB14. Any ÷ Any
//
{HanEx} { return IDEOGRAPHIC_TYPE; }
{HiraganaEx} { return HIRAGANA_TYPE; }
// UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB14. Any ÷ Any
//
[^] { /* Break so we don't hit fall-through warning: */ break;/* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }

View File

@ -352,6 +352,31 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
(emailAnalyzer, randomTextWithEmails, emails);
}
public void testMailtoSchemeEmails () throws Exception {
// See LUCENE-3880
BaseTokenStreamTestCase.assertAnalyzesTo(a, "mailto:test@example.org",
new String[] {"mailto", "test@example.org"},
new String[] { "<ALPHANUM>", "<EMAIL>" });
// TODO: Support full mailto: scheme URIs. See RFC 6068: http://tools.ietf.org/html/rfc6068
BaseTokenStreamTestCase.assertAnalyzesTo
(a, "mailto:personA@example.com,personB@example.com?cc=personC@example.com"
+ "&subject=Subjectivity&body=Corpusivity%20or%20something%20like%20that",
new String[] { "mailto",
"personA@example.com",
// TODO: recognize ',' address delimiter. Also, see examples of ';' delimiter use at: http://www.mailto.co.uk/
",personB@example.com",
"?cc=personC@example.com", // TODO: split field keys/values
"subject", "Subjectivity",
"body", "Corpusivity", "20or", "20something","20like", "20that" }, // TODO: Hex decoding + re-tokenization
new String[] { "<ALPHANUM>",
"<EMAIL>",
"<EMAIL>",
"<EMAIL>",
"<ALPHANUM>", "<ALPHANUM>",
"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
}
public void testURLs() throws Exception {
Reader reader = null;
String randomTextWithURLs;