LUCENE-3361: port url+email tokenizer to standardtokenizerinterface, fix combining marks bug

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1154936 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2011-08-08 11:57:59 +00:00
parent 718f42479f
commit ef56f5d551
14 changed files with 8428 additions and 3839 deletions

View File

@ -542,10 +542,11 @@ Bug fixes
* LUCENE-3340: Fixed case where IndexWriter was not flushing at
exactly maxBufferedDeleteTerms (Mike McCandless)
* LUCENE-3358: StandardTokenizer wrongly discarded combining marks attached
to Han or Hiragana characters; this is fixed if you supply Version >= 3.4.
If you supply a previous Lucene version, you get the old buggy behavior
for backwards compatibility. (Trejkaz, Robert Muir)
* LUCENE-3358, LUCENE-3361: StandardTokenizer and UAX29URLEmailTokenizer
wrongly discarded combining marks attached to Han or Hiragana characters;
this is fixed if you supply Version >= 3.4. If you supply a previous
Lucene version, you get the old buggy behavior for backwards compatibility.
(Trejkaz, Robert Muir)
New Features
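
The LUCENE-3358/LUCENE-3361 bug-fix entry above is the user-visible contract of this commit: the tokenizer now takes a matchVersion argument and only applies the combining-mark fix for Version >= 3.4. A minimal sketch of what that looks like from application code, assuming the Version-taking constructor added here (the Version.LUCENE_34/LUCENE_31 constant names follow the "Version >= 3.4" wording; the tests below use TEST_VERSION_CURRENT instead), with expected output inferred from the bug description rather than captured from a run:

import java.io.StringReader;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class CombiningMarksDemo {
  public static void main(String[] args) throws Exception {
    // Decomposed hiragana: さ (U+3055) followed by the combining voiced sound mark (U+3099).
    String input = "\u3055\u3099";

    // Version >= 3.4: the combining mark stays attached to the hiragana base character.
    UAX29URLEmailTokenizer fixed =
        new UAX29URLEmailTokenizer(Version.LUCENE_34, new StringReader(input));
    CharTermAttribute term = fixed.addAttribute(CharTermAttribute.class);
    fixed.reset();
    while (fixed.incrementToken()) {
      System.out.println(term.toString());  // expected: "ざ" (base char + mark as one token)
    }
    fixed.end();
    fixed.close();

    // Passing an earlier version (e.g. Version.LUCENE_31) selects the old scanner and
    // reproduces the buggy behavior (mark discarded), purely for index backwards compatibility.
  }
}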

View File

@ -56,7 +56,7 @@
nobak="on"/>
</target>
<target name="jflex-StandardAnalyzer" depends="init,jflex-check,gen-tlds" if="jflex.present">
<target name="jflex-StandardAnalyzer" depends="init,jflex-check" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>
</taskdef>
@ -76,9 +76,12 @@
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>
</taskdef>
<jflex file="src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex"
<jflex file="src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex"
outdir="src/java/org/apache/lucene/analysis/standard"
nobak="on" />
<jflex file="src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex"
outdir="src/java/org/apache/lucene/analysis/standard/std31"
nobak="on" />
</target>
<target name="clean-jflex">
@ -86,7 +89,7 @@
<fileset dir="src/java/org/apache/lucene/analysis/wikipedia" includes="*.java">
<containsregexp expression="generated.*by.*JFlex"/>
</fileset>
<fileset dir="src/java/org/apache/lucene/analysis/standard" includes="*.java">
<fileset dir="src/java/org/apache/lucene/analysis/standard" includes="**/*.java">
<containsregexp expression="generated.*by.*JFlex"/>
</fileset>
</delete>

View File

@ -15,8 +15,8 @@
*/
// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
// file version from Wednesday, February 9, 2011 12:34:10 PM UTC
// generated on Wednesday, February 9, 2011 4:45:18 PM UTC
// file version from Thursday, August 4, 2011 11:34:20 AM UTC
// generated on Thursday, August 4, 2011 11:46:19 PM UTC
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
ASCIITLD = "." (
@ -288,6 +288,7 @@ ASCIITLD = "." (
| [xX][nN]--3[eE]0[bB]707[eE]
| [xX][nN]--45[bB][rR][jJ]9[cC]
| [xX][nN]--80[aA][kK][hH][bB][yY][kK][nN][jJ]4[fF]
| [xX][nN]--90[aA]3[aA][cC]
| [xX][nN]--9[tT]4[bB]11[yY][iI]5[aA]
| [xX][nN]--[cC][lL][cC][hH][cC]0[eE][aA]0[bB]2[gG]2[aA]9[gG][cC][dD]
| [xX][nN]--[dD][eE][bB][aA]0[aA][dD]
@ -305,9 +306,11 @@ ASCIITLD = "." (
| [xX][nN]--[kK][gG][bB][eE][cC][hH][tT][vV]
| [xX][nN]--[kK][pP][rR][wW]13[dD]
| [xX][nN]--[kK][pP][rR][yY]57[dD]
| [xX][nN]--[lL][gG][bB][bB][aA][tT]1[aA][dD]8[jJ]
| [xX][nN]--[mM][gG][bB][aA][aA][mM]7[aA]8[hH]
| [xX][nN]--[mM][gG][bB][aA][yY][hH]7[gG][pP][aA]
| [xX][nN]--[mM][gG][bB][bB][hH]1[aA]71[eE]
| [xX][nN]--[mM][gG][bB][cC]0[aA]9[aA][zZ][cC][gG]
| [xX][nN]--[mM][gG][bB][eE][rR][pP]4[aA]5[dD]4[aA][rR]
| [xX][nN]--[oO]3[cC][wW]4[hH]
| [xX][nN]--[oO][gG][bB][pP][fF]8[fF][lL]
@ -321,6 +324,7 @@ ASCIITLD = "." (
| [xX][nN]--[yY][fF][rR][oO]4[iI]67[oO]
| [xX][nN]--[yY][gG][bB][iI]2[aA][mM][mM][xX]
| [xX][nN]--[zZ][cC][kK][zZ][aA][hH]
| [xX][xX][xX]
| [yY][eE]
| [yY][tT]
| [zZ][aA]

View File

@ -17,16 +17,7 @@ package org.apache.lucene.analysis.standard;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* This class implements Word Break rules from the Unicode Text Segmentation
@ -49,20 +40,14 @@ import org.apache.lucene.util.AttributeSource;
%%
%unicode 6.0
%integer
%final
%public
%apiprivate
%class UAX29URLEmailTokenizer
%extends Tokenizer
%type boolean
%class UAX29URLEmailTokenizerImpl
%implements StandardTokenizerInterface
%function getNextToken
%char
%init{
super(in);
%init}
%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
Format = ([\p{WB:Format}] | {FormatSupp})
@ -89,6 +74,8 @@ MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
HanEx = {Han} ({Format} | {Extend})*
HiraganaEx = {Hiragana} ({Format} | {Extend})*
// URL and E-mail syntax specifications:
//
@ -170,16 +157,10 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
%{
/** Alphanumeric sequences */
public static final String WORD_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
public static final int WORD_TYPE = UAX29URLEmailTokenizer.ALPHANUM;
/** Numbers */
public static final String NUMERIC_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
/** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
public static final String URL_TYPE = "<URL>";
/** E-mail addresses */
public static final String EMAIL_TYPE = "<EMAIL>";
public static final int NUMERIC_TYPE = UAX29URLEmailTokenizer.NUM;
/**
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
@ -189,114 +170,30 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
* <p>
* See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
*/
public static final String SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.SOUTHEAST_ASIAN];
public static final int SOUTH_EAST_ASIAN_TYPE = UAX29URLEmailTokenizer.SOUTHEAST_ASIAN;
public static final String IDEOGRAPHIC_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
public static final int IDEOGRAPHIC_TYPE = UAX29URLEmailTokenizer.IDEOGRAPHIC;
public static final String HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
public static final int HIRAGANA_TYPE = UAX29URLEmailTokenizer.HIRAGANA;
public static final String KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
public static final int KATAKANA_TYPE = UAX29URLEmailTokenizer.KATAKANA;
public static final int HANGUL_TYPE = UAX29URLEmailTokenizer.HANGUL;
public static final int EMAIL_TYPE = UAX29URLEmailTokenizer.EMAIL;
public static final int URL_TYPE = UAX29URLEmailTokenizer.URL;
public static final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt
= addAttribute(PositionIncrementAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
private int posIncr;
/**
* @param source The AttributeSource to use
* @param input The input reader
*/
public UAX29URLEmailTokenizer(AttributeSource source, Reader input) {
super(source, input);
zzReader = input;
}
/**
* @param factory The AttributeFactory to use
* @param input The input reader
*/
public UAX29URLEmailTokenizer(AttributeFactory factory, Reader input) {
super(factory, input);
zzReader = input;
}
/**
* Set the max allowed token length. Any token longer than this is skipped.
* @param length the new max allowed token length
*/
public void setMaxTokenLength(int length) {
this.maxTokenLength = length;
public final int yychar()
{
return yychar;
}
/**
* Returns the max allowed token length. Any token longer than this is
* skipped.
* @return the max allowed token length
* Fills CharTermAttribute with the current token text.
*/
public int getMaxTokenLength() {
return maxTokenLength;
}
@Override
public final void end() {
// set final offset
int finalOffset = correctOffset(yychar + yylength());
offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
public void reset(Reader reader) throws IOException {
super.reset(reader);
yyreset(reader);
}
@Override
public final boolean incrementToken() throws IOException {
// This method is required because of two JFlex limitations:
// 1. No way to insert code at the beginning of the generated scanning
// get-next-token method; and
// 2. No way to declare @Override on the generated scanning method.
clearAttributes();
posIncr = 1;
return getNextToken();
}
/**
* Populates this TokenStream's CharTermAttribute and OffsetAttribute from
* the current match, the TypeAttribute from the passed-in tokenType, and
* the PositionIncrementAttribute to one, unless the immediately previous
* token(s) was/were skipped because maxTokenLength was exceeded, in which
* case the PositionIncrementAttribute is set to one plus the number of
* skipped overly long tokens.
* <p/>
* If maxTokenLength is exceeded, the CharTermAttribute is set back to empty
* and false is returned.
*
* @param tokenType The type of the matching token
* @return true there is a token available (not too long); false otherwise
*/
private boolean populateAttributes(String tokenType) {
boolean isTokenAvailable = false;
if (yylength() > maxTokenLength) {
// When we skip a too-long token, we treat it like a stopword, introducing
// a position increment gap
++posIncr;
} else {
termAtt.copyBuffer(zzBuffer, zzStartRead, yylength());
posIncrAtt.setPositionIncrement(posIncr);
offsetAtt.setOffset(correctOffset(yychar),
correctOffset(yychar + yylength()));
typeAtt.setType(tokenType);
isTokenAvailable = true;
}
return isTokenAvailable;
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
%}
@ -305,10 +202,10 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
// UAX#29 WB1. sot ÷
// WB2. ÷ eot
//
<<EOF>> { return false; }
<<EOF>> { return StandardTokenizerInterface.YYEOF; }
{URL} { if (populateAttributes(URL_TYPE)) return true; }
{EMAIL} {if (populateAttributes(EMAIL_TYPE)) return true; }
{URL} { return URL_TYPE; }
{EMAIL} { return EMAIL_TYPE; }
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet) × Numeric
@ -320,14 +217,14 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
| {MidNumericEx} {NumericEx}
| {NumericEx})*
{ExtendNumLetEx}*
{ if (populateAttributes(NUMERIC_TYPE)) return true; }
{ return NUMERIC_TYPE; }
// subset of the below for typing purposes only!
{HangulEx}+
{ if (populateAttributes(HANGUL_TYPE)) return true; }
{ return HANGUL_TYPE; }
{KatakanaEx}+
{ if (populateAttributes(KATAKANA_TYPE)) return true; }
{ return KATAKANA_TYPE; }
// UAX#29 WB5. ALetter × ALetter
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
@ -345,7 +242,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
{ExtendNumLetEx}*
{ if (populateAttributes(WORD_TYPE)) return true; }
{ return WORD_TYPE; }
// From UAX #29:
@ -367,12 +264,12 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
//
// http://www.unicode.org/reports/tr14/#SA
//
{ComplexContext}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }
{ComplexContext}+ { return SOUTH_EAST_ASIAN_TYPE; }
// UAX#29 WB14. Any ÷ Any
//
{Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
{Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }
{HanEx} { return IDEOGRAPHIC_TYPE; }
{HiraganaEx} { return HIRAGANA_TYPE; }
// UAX#29 WB3. CR × LF
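
The diff above is the port named in the commit message: the grammar no longer extends Tokenizer and populates attributes from inside its own rule actions; it is now a plain scanner class (UAX29URLEmailTokenizerImpl) that implements StandardTokenizerInterface and simply returns an int token type per match, leaving attribute handling to a wrapping Tokenizer. A hedged sketch of how such a wrapper can drive the scanner, modeled on StandardTokenizer.incrementToken(); the class name, maxTokenLength default, and exact structure of the real UAX29URLEmailTokenizer wrapper are assumptions, but the interface calls (getNextToken, getText, yychar, yylength, yyreset, YYEOF) and the TOKEN_TYPES lookup are the ones used elsewhere in this commit:

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizerInterface;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

/** Sketch only: a Tokenizer driving a StandardTokenizerInterface scanner. */
final class ScannerDrivenTokenizer extends Tokenizer {
  private final StandardTokenizerInterface scanner;  // e.g. new UAX29URLEmailTokenizerImpl(in)
  private int maxTokenLength = 255;                   // StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

  ScannerDrivenTokenizer(Reader in, StandardTokenizerInterface scanner) {
    super(in);
    this.scanner = scanner;
  }

  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();
    int skippedPositions = 0;
    while (true) {
      int tokenType = scanner.getNextToken();
      if (tokenType == StandardTokenizerInterface.YYEOF) {
        return false;
      }
      if (scanner.yylength() <= maxTokenLength) {
        posIncrAtt.setPositionIncrement(1 + skippedPositions);
        scanner.getText(termAtt);                     // fills CharTermAttribute from the match
        final int start = correctOffset(scanner.yychar());
        offsetAtt.setOffset(start, correctOffset(scanner.yychar() + scanner.yylength()));
        typeAtt.setType(UAX29URLEmailTokenizer.TOKEN_TYPES[tokenType]);  // int type -> display string
        return true;
      } else {
        skippedPositions++;                           // over-length token: skip it like a stopword
      }
    }
  }

  @Override
  public void reset(Reader reader) throws IOException {
    super.reset(reader);
    scanner.yyreset(reader);
  }
}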

View File

@ -0,0 +1,330 @@
/*
* Copyright 2001-2005 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
// file version from Wednesday, February 9, 2011 12:34:10 PM UTC
// generated on Wednesday, February 9, 2011 4:45:18 PM UTC
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
ASCIITLD = "." (
[aA][cC]
| [aA][dD]
| [aA][eE]
| [aA][eE][rR][oO]
| [aA][fF]
| [aA][gG]
| [aA][iI]
| [aA][lL]
| [aA][mM]
| [aA][nN]
| [aA][oO]
| [aA][qQ]
| [aA][rR]
| [aA][rR][pP][aA]
| [aA][sS]
| [aA][sS][iI][aA]
| [aA][tT]
| [aA][uU]
| [aA][wW]
| [aA][xX]
| [aA][zZ]
| [bB][aA]
| [bB][bB]
| [bB][dD]
| [bB][eE]
| [bB][fF]
| [bB][gG]
| [bB][hH]
| [bB][iI]
| [bB][iI][zZ]
| [bB][jJ]
| [bB][mM]
| [bB][nN]
| [bB][oO]
| [bB][rR]
| [bB][sS]
| [bB][tT]
| [bB][vV]
| [bB][wW]
| [bB][yY]
| [bB][zZ]
| [cC][aA]
| [cC][aA][tT]
| [cC][cC]
| [cC][dD]
| [cC][fF]
| [cC][gG]
| [cC][hH]
| [cC][iI]
| [cC][kK]
| [cC][lL]
| [cC][mM]
| [cC][nN]
| [cC][oO]
| [cC][oO][mM]
| [cC][oO][oO][pP]
| [cC][rR]
| [cC][uU]
| [cC][vV]
| [cC][xX]
| [cC][yY]
| [cC][zZ]
| [dD][eE]
| [dD][jJ]
| [dD][kK]
| [dD][mM]
| [dD][oO]
| [dD][zZ]
| [eE][cC]
| [eE][dD][uU]
| [eE][eE]
| [eE][gG]
| [eE][rR]
| [eE][sS]
| [eE][tT]
| [eE][uU]
| [fF][iI]
| [fF][jJ]
| [fF][kK]
| [fF][mM]
| [fF][oO]
| [fF][rR]
| [gG][aA]
| [gG][bB]
| [gG][dD]
| [gG][eE]
| [gG][fF]
| [gG][gG]
| [gG][hH]
| [gG][iI]
| [gG][lL]
| [gG][mM]
| [gG][nN]
| [gG][oO][vV]
| [gG][pP]
| [gG][qQ]
| [gG][rR]
| [gG][sS]
| [gG][tT]
| [gG][uU]
| [gG][wW]
| [gG][yY]
| [hH][kK]
| [hH][mM]
| [hH][nN]
| [hH][rR]
| [hH][tT]
| [hH][uU]
| [iI][dD]
| [iI][eE]
| [iI][lL]
| [iI][mM]
| [iI][nN]
| [iI][nN][fF][oO]
| [iI][nN][tT]
| [iI][oO]
| [iI][qQ]
| [iI][rR]
| [iI][sS]
| [iI][tT]
| [jJ][eE]
| [jJ][mM]
| [jJ][oO]
| [jJ][oO][bB][sS]
| [jJ][pP]
| [kK][eE]
| [kK][gG]
| [kK][hH]
| [kK][iI]
| [kK][mM]
| [kK][nN]
| [kK][pP]
| [kK][rR]
| [kK][wW]
| [kK][yY]
| [kK][zZ]
| [lL][aA]
| [lL][bB]
| [lL][cC]
| [lL][iI]
| [lL][kK]
| [lL][rR]
| [lL][sS]
| [lL][tT]
| [lL][uU]
| [lL][vV]
| [lL][yY]
| [mM][aA]
| [mM][cC]
| [mM][dD]
| [mM][eE]
| [mM][gG]
| [mM][hH]
| [mM][iI][lL]
| [mM][kK]
| [mM][lL]
| [mM][mM]
| [mM][nN]
| [mM][oO]
| [mM][oO][bB][iI]
| [mM][pP]
| [mM][qQ]
| [mM][rR]
| [mM][sS]
| [mM][tT]
| [mM][uU]
| [mM][uU][sS][eE][uU][mM]
| [mM][vV]
| [mM][wW]
| [mM][xX]
| [mM][yY]
| [mM][zZ]
| [nN][aA]
| [nN][aA][mM][eE]
| [nN][cC]
| [nN][eE]
| [nN][eE][tT]
| [nN][fF]
| [nN][gG]
| [nN][iI]
| [nN][lL]
| [nN][oO]
| [nN][pP]
| [nN][rR]
| [nN][uU]
| [nN][zZ]
| [oO][mM]
| [oO][rR][gG]
| [pP][aA]
| [pP][eE]
| [pP][fF]
| [pP][gG]
| [pP][hH]
| [pP][kK]
| [pP][lL]
| [pP][mM]
| [pP][nN]
| [pP][rR]
| [pP][rR][oO]
| [pP][sS]
| [pP][tT]
| [pP][wW]
| [pP][yY]
| [qQ][aA]
| [rR][eE]
| [rR][oO]
| [rR][sS]
| [rR][uU]
| [rR][wW]
| [sS][aA]
| [sS][bB]
| [sS][cC]
| [sS][dD]
| [sS][eE]
| [sS][gG]
| [sS][hH]
| [sS][iI]
| [sS][jJ]
| [sS][kK]
| [sS][lL]
| [sS][mM]
| [sS][nN]
| [sS][oO]
| [sS][rR]
| [sS][tT]
| [sS][uU]
| [sS][vV]
| [sS][yY]
| [sS][zZ]
| [tT][cC]
| [tT][dD]
| [tT][eE][lL]
| [tT][fF]
| [tT][gG]
| [tT][hH]
| [tT][jJ]
| [tT][kK]
| [tT][lL]
| [tT][mM]
| [tT][nN]
| [tT][oO]
| [tT][pP]
| [tT][rR]
| [tT][rR][aA][vV][eE][lL]
| [tT][tT]
| [tT][vV]
| [tT][wW]
| [tT][zZ]
| [uU][aA]
| [uU][gG]
| [uU][kK]
| [uU][sS]
| [uU][yY]
| [uU][zZ]
| [vV][aA]
| [vV][cC]
| [vV][eE]
| [vV][gG]
| [vV][iI]
| [vV][nN]
| [vV][uU]
| [wW][fF]
| [wW][sS]
| [xX][nN]--0[zZ][wW][mM]56[dD]
| [xX][nN]--11[bB]5[bB][sS]3[aA]9[aA][jJ]6[gG]
| [xX][nN]--3[eE]0[bB]707[eE]
| [xX][nN]--45[bB][rR][jJ]9[cC]
| [xX][nN]--80[aA][kK][hH][bB][yY][kK][nN][jJ]4[fF]
| [xX][nN]--9[tT]4[bB]11[yY][iI]5[aA]
| [xX][nN]--[cC][lL][cC][hH][cC]0[eE][aA]0[bB]2[gG]2[aA]9[gG][cC][dD]
| [xX][nN]--[dD][eE][bB][aA]0[aA][dD]
| [xX][nN]--[fF][iI][qQ][sS]8[sS]
| [xX][nN]--[fF][iI][qQ][zZ]9[sS]
| [xX][nN]--[fF][pP][cC][rR][jJ]9[cC]3[dD]
| [xX][nN]--[fF][zZ][cC]2[cC]9[eE]2[cC]
| [xX][nN]--[gG]6[wW]251[dD]
| [xX][nN]--[gG][eE][cC][rR][jJ]9[cC]
| [xX][nN]--[hH]2[bB][rR][jJ]9[cC]
| [xX][nN]--[hH][gG][bB][kK]6[aA][jJ]7[fF]53[bB][bB][aA]
| [xX][nN]--[hH][lL][cC][jJ]6[aA][yY][aA]9[eE][sS][cC]7[aA]
| [xX][nN]--[jJ]6[wW]193[gG]
| [xX][nN]--[jJ][xX][aA][lL][pP][dD][lL][pP]
| [xX][nN]--[kK][gG][bB][eE][cC][hH][tT][vV]
| [xX][nN]--[kK][pP][rR][wW]13[dD]
| [xX][nN]--[kK][pP][rR][yY]57[dD]
| [xX][nN]--[mM][gG][bB][aA][aA][mM]7[aA]8[hH]
| [xX][nN]--[mM][gG][bB][aA][yY][hH]7[gG][pP][aA]
| [xX][nN]--[mM][gG][bB][bB][hH]1[aA]71[eE]
| [xX][nN]--[mM][gG][bB][eE][rR][pP]4[aA]5[dD]4[aA][rR]
| [xX][nN]--[oO]3[cC][wW]4[hH]
| [xX][nN]--[oO][gG][bB][pP][fF]8[fF][lL]
| [xX][nN]--[pP]1[aA][iI]
| [xX][nN]--[pP][gG][bB][sS]0[dD][hH]
| [xX][nN]--[sS]9[bB][rR][jJ]9[cC]
| [xX][nN]--[wW][gG][bB][hH]1[cC]
| [xX][nN]--[wW][gG][bB][lL]6[aA]
| [xX][nN]--[xX][kK][cC]2[aA][lL]3[hH][yY][eE]2[aA]
| [xX][nN]--[xX][kK][cC]2[dD][lL]3[aA]5[eE][eE]0[hH]
| [xX][nN]--[yY][fF][rR][oO]4[iI]67[oO]
| [xX][nN]--[yY][gG][bB][iI]2[aA][mM][mM][xX]
| [xX][nN]--[zZ][cC][kK][zZ][aA][hH]
| [yY][eE]
| [yY][tT]
| [zZ][aA]
| [zZ][mM]
| [zZ][wW]
) "."? // Accept trailing root (empty) domain

View File

@ -0,0 +1,125 @@
/*
* Copyright 2010 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Generated using ICU4J 4.6.0.0 on Wednesday, February 9, 2011 4:45:11 PM UTC
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
ALetterSupp = (
([\ud80d][\uDC00-\uDC2E])
| ([\ud80c][\uDC00-\uDFFF])
| ([\ud809][\uDC00-\uDC62])
| ([\ud808][\uDC00-\uDF6E])
| ([\ud81a][\uDC00-\uDE38])
| ([\ud804][\uDC03-\uDC37\uDC83-\uDCAF])
| ([\ud835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB])
| ([\ud801][\uDC00-\uDC9D])
| ([\ud800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5])
| ([\ud803][\uDC00-\uDC48])
| ([\ud802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72])
)
FormatSupp = (
([\ud804][\uDCBD])
| ([\ud834][\uDD73-\uDD7A])
| ([\udb40][\uDC01\uDC20-\uDC7F])
)
ExtendSupp = (
([\ud804][\uDC00-\uDC02\uDC38-\uDC46\uDC80-\uDC82\uDCB0-\uDCBA])
| ([\ud834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44])
| ([\ud800][\uDDFD])
| ([\udb40][\uDD00-\uDDEF])
| ([\ud802][\uDE01-\uDE03\uDE05\uDE06\uDE0C-\uDE0F\uDE38-\uDE3A\uDE3F])
)
NumericSupp = (
([\ud804][\uDC66-\uDC6F])
| ([\ud835][\uDFCE-\uDFFF])
| ([\ud801][\uDCA0-\uDCA9])
)
KatakanaSupp = (
([\ud82c][\uDC00])
)
MidLetterSupp = (
[]
)
MidNumSupp = (
[]
)
MidNumLetSupp = (
[]
)
ExtendNumLetSupp = (
[]
)
ComplexContextSupp = (
[]
)
HanSupp = (
([\ud87e][\uDC00-\uDE1D])
| ([\ud86b][\uDC00-\uDFFF])
| ([\ud86a][\uDC00-\uDFFF])
| ([\ud869][\uDC00-\uDED6\uDF00-\uDFFF])
| ([\ud868][\uDC00-\uDFFF])
| ([\ud86e][\uDC00-\uDC1D])
| ([\ud86d][\uDC00-\uDF34\uDF40-\uDFFF])
| ([\ud86c][\uDC00-\uDFFF])
| ([\ud863][\uDC00-\uDFFF])
| ([\ud862][\uDC00-\uDFFF])
| ([\ud861][\uDC00-\uDFFF])
| ([\ud860][\uDC00-\uDFFF])
| ([\ud867][\uDC00-\uDFFF])
| ([\ud866][\uDC00-\uDFFF])
| ([\ud865][\uDC00-\uDFFF])
| ([\ud864][\uDC00-\uDFFF])
| ([\ud858][\uDC00-\uDFFF])
| ([\ud859][\uDC00-\uDFFF])
| ([\ud85a][\uDC00-\uDFFF])
| ([\ud85b][\uDC00-\uDFFF])
| ([\ud85c][\uDC00-\uDFFF])
| ([\ud85d][\uDC00-\uDFFF])
| ([\ud85e][\uDC00-\uDFFF])
| ([\ud85f][\uDC00-\uDFFF])
| ([\ud850][\uDC00-\uDFFF])
| ([\ud851][\uDC00-\uDFFF])
| ([\ud852][\uDC00-\uDFFF])
| ([\ud853][\uDC00-\uDFFF])
| ([\ud854][\uDC00-\uDFFF])
| ([\ud855][\uDC00-\uDFFF])
| ([\ud856][\uDC00-\uDFFF])
| ([\ud857][\uDC00-\uDFFF])
| ([\ud849][\uDC00-\uDFFF])
| ([\ud848][\uDC00-\uDFFF])
| ([\ud84b][\uDC00-\uDFFF])
| ([\ud84a][\uDC00-\uDFFF])
| ([\ud84d][\uDC00-\uDFFF])
| ([\ud84c][\uDC00-\uDFFF])
| ([\ud84f][\uDC00-\uDFFF])
| ([\ud84e][\uDC00-\uDFFF])
| ([\ud841][\uDC00-\uDFFF])
| ([\ud840][\uDC00-\uDFFF])
| ([\ud843][\uDC00-\uDFFF])
| ([\ud842][\uDC00-\uDFFF])
| ([\ud845][\uDC00-\uDFFF])
| ([\ud844][\uDC00-\uDFFF])
| ([\ud847][\uDC00-\uDFFF])
| ([\ud846][\uDC00-\uDFFF])
)
HiraganaSupp = (
([\ud83c][\uDE00])
| ([\ud82c][\uDC01])
)
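
These Supp macros exist because the generated scanner operates on 16-bit code units, so supplementary (non-BMP) characters have to be matched as their UTF-16 surrogate pairs; the generator emits one high-surrogate/low-surrogate range per block. A small illustration (not part of the generator) of how a supplementary code point maps onto those ranges:

public class SurrogatePairDemo {
  public static void main(String[] args) {
    // U+20000 (CJK Unified Ideographs Extension B) is covered by the
    // HanSupp alternative ([\ud840][\uDC00-\uDFFF]) above.
    char[] pair = Character.toChars(0x20000);
    System.out.printf("\\u%04X \\u%04X%n", (int) pair[0], (int) pair[1]);  // \uD840 \uDC00
  }
}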

View File

@ -39,7 +39,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%function getNextToken
%char
%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
%include src/java/org/apache/lucene/analysis/standard/std31/SUPPLEMENTARY.jflex-macro
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
Format = ([\p{WB:Format}] | {FormatSupp})
Numeric = ([\p{WB:Numeric}] | {NumericSupp})

View File

@ -0,0 +1,269 @@
package org.apache.lucene.analysis.standard.std31;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizerInterface;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* This class implements UAX29URLEmailTokenizer, except with a bug
* (https://issues.apache.org/jira/browse/LUCENE-3358) where Han and Hiragana
* characters would be split from combining characters:
* @deprecated This class is only for exact backwards compatibility
*/
@Deprecated
%%
%unicode 6.0
%integer
%final
%public
%class UAX29URLEmailTokenizerImpl31
%implements StandardTokenizerInterface
%function getNextToken
%char
%include src/java/org/apache/lucene/analysis/standard/std31/SUPPLEMENTARY.jflex-macro
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
Format = ([\p{WB:Format}] | {FormatSupp})
Numeric = ([\p{WB:Numeric}] | {NumericSupp})
Extend = ([\p{WB:Extend}] | {ExtendSupp})
Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
Han = ([\p{Script:Han}] | {HanSupp})
Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
// Script=Hangul & ALetter
HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
// UAX#29 WB4. X (Extend | Format)* --> X
//
ALetterEx = {ALetter} ({Format} | {Extend})*
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
NumericEx = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
KatakanaEx = {Katakana} ({Format} | {Extend})*
MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
// URL and E-mail syntax specifications:
//
// RFC-952: DOD INTERNET HOST TABLE SPECIFICATION
// RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
// RFC-1123: Requirements for Internet Hosts - Application and Support
// RFC-1738: Uniform Resource Locators (URL)
// RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
// RFC-5234: Augmented BNF for Syntax Specifications: ABNF
// RFC-5321: Simple Mail Transfer Protocol
// RFC-5322: Internet Message Format
%include src/java/org/apache/lucene/analysis/standard/std31/ASCIITLD.jflex-macro
DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
DomainNameLoose = {DomainLabel} ("." {DomainLabel})*
IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
IPv4Address = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3}
IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
IPv6Address = ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
| "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
| {IPv6Hex16Bit}? "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit} ":" {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::" {IPv6LeastSignificant32Bits}
| (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit}
| (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"
URIunreserved = [-._~A-Za-z0-9]
URIpercentEncoded = "%" [0-9A-Fa-f]{2}
URIsubDelims = [!$&'()*+,;=]
URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
URIquery = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIport = ":" [0-9]{1,5}
URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}
URIhostLoose = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose}
URIauthorityStrict = {URIhostStrict} {URIport}?
URIauthorityLoose = {URIlogin}? {URIhostLoose} {URIport}?
HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
HTTPpath = ("/" {HTTPsegment})*
HTTPscheme = [hH][tT][tT][pP][sS]? "://"
HTTPurlFull = {HTTPscheme} {URIauthorityLoose} {HTTPpath}? {URIquery}? {URIfragment}?
// {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
HTTPurlNoScheme = {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}
FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
FTPscheme = [fF][tT][pP] "://"
FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?
FILEscheme = [fF][iI][lL][eE] "://"
FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?
URL = {HTTPurl} | {FTPurl} | {FILEurl}
EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
// DFA minimization allows {IPv6Address} and {IPv4Address} to be included
// in the {EMAILbracketedHost} definition without incurring any size penalties,
// since {EMAILdomainLiteralText} recognizes all valid IP addresses.
// The IP address regexes are included in {EMAILbracketedHost} simply as a
// reminder that they are acceptable bracketed host forms.
EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"
EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
%{
/** Alphanumeric sequences */
public static final int WORD_TYPE = UAX29URLEmailTokenizer.ALPHANUM;
/** Numbers */
public static final int NUMERIC_TYPE = UAX29URLEmailTokenizer.NUM;
/**
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
* scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
* together as a single token rather than broken up, because the logic
* required to break them at word boundaries is too complex for UAX#29.
* <p>
* See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
*/
public static final int SOUTH_EAST_ASIAN_TYPE = UAX29URLEmailTokenizer.SOUTHEAST_ASIAN;
public static final int IDEOGRAPHIC_TYPE = UAX29URLEmailTokenizer.IDEOGRAPHIC;
public static final int HIRAGANA_TYPE = UAX29URLEmailTokenizer.HIRAGANA;
public static final int KATAKANA_TYPE = UAX29URLEmailTokenizer.KATAKANA;
public static final int HANGUL_TYPE = UAX29URLEmailTokenizer.HANGUL;
public static final int EMAIL_TYPE = UAX29URLEmailTokenizer.EMAIL;
public static final int URL_TYPE = UAX29URLEmailTokenizer.URL;
public final int yychar()
{
return yychar;
}
/**
* Fills CharTermAttribute with the current token text.
*/
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
%}
%%
// UAX#29 WB1. sot ÷
// WB2. ÷ eot
//
<<EOF>> { return StandardTokenizerInterface.YYEOF; }
{URL} { return URL_TYPE; }
{EMAIL} { return EMAIL_TYPE; }
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet) × Numeric
// WB12. Numeric × (MidNum | MidNumLet) Numeric
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
| {MidNumericEx} {NumericEx}
| {NumericEx})*
{ExtendNumLetEx}*
{ return NUMERIC_TYPE; }
// subset of the below for typing purposes only!
{HangulEx}+
{ return HANGUL_TYPE; }
{KatakanaEx}+
{ return KATAKANA_TYPE; }
// UAX#29 WB5. ALetter × ALetter
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
// WB7. ALetter (MidLetter | MidNumLet) × ALetter
// WB9. ALetter × Numeric
// WB10. Numeric × ALetter
// WB13. Katakana × Katakana
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ )
({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
{ExtendNumLetEx}*
{ return WORD_TYPE; }
// From UAX #29:
//
// [C]haracters with the Line_Break property values of Contingent_Break (CB),
// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
// boundary property values based on criteria outside of the scope of this
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
// In Unicode 6.0, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC ( ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
// Lao, etc.) are kept together. This grammar does the same below.
//
// See also the Unicode Line Breaking Algorithm:
//
// http://www.unicode.org/reports/tr14/#SA
//
{ComplexContext}+ { return SOUTH_EAST_ASIAN_TYPE; }
// UAX#29 WB14. Any ÷ Any
//
{Han} { return IDEOGRAPHIC_TYPE; }
{Hiragana} { return HIRAGANA_TYPE; }
// UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB14. Any ÷ Any
//
[^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
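
Both this backwards-compatibility grammar and the current one recognize whole URLs and e-mail addresses as single tokens and report them with their own token types. A hedged sketch of what that looks like through the public tokenizer (constructor and constants as used in the test changes below; Version.LUCENE_34 is assumed from the CHANGES entry, and the expected output follows from the grammar rather than a captured run):

import java.io.StringReader;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;

public class UrlEmailDemo {
  public static void main(String[] args) throws Exception {
    UAX29URLEmailTokenizer ts = new UAX29URLEmailTokenizer(Version.LUCENE_34,
        new StringReader("visit http://example.com or mail info@example.com"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term + " " + type.type());
      // expected roughly:
      //   visit                <ALPHANUM>
      //   http://example.com   <URL>
      //   or                   <ALPHANUM>
      //   mail                 <ALPHANUM>
      //   info@example.com     <EMAIL>
    }
    ts.end();
    ts.close();
  }
}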

View File

@ -5,9 +5,11 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.apache.lucene.util.Version;
import java.io.BufferedReader;
import java.io.IOException;
@ -44,7 +46,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
sb.append(whitespace);
sb.append("testing 1234");
String input = sb.toString();
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(new StringReader(input));
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
@ -53,7 +55,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
protected TokenStreamComponents createComponents
(String fieldName, Reader reader) {
Tokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
Tokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, reader);
return new TokenStreamComponents(tokenizer);
}
};
@ -69,7 +71,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
public final boolean incrementToken() throws java.io.IOException {
boolean isTokenAvailable = false;
while (input.incrementToken()) {
if (typeAtt.type() == UAX29URLEmailTokenizer.URL_TYPE) {
if (typeAtt.type() == UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.URL]) {
isTokenAvailable = true;
break;
}
@ -88,7 +90,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
public final boolean incrementToken() throws java.io.IOException {
boolean isTokenAvailable = false;
while (input.incrementToken()) {
if (typeAtt.type() == UAX29URLEmailTokenizer.EMAIL_TYPE) {
if (typeAtt.type() == UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.EMAIL]) {
isTokenAvailable = true;
break;
}
@ -100,7 +102,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
private Analyzer urlAnalyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, reader);
tokenizer.setMaxTokenLength(Integer.MAX_VALUE); // Tokenize arbitrary length URLs
TokenFilter filter = new URLFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
@ -110,7 +112,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
private Analyzer emailAnalyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, reader);
TokenFilter filter = new EmailFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}
@ -418,7 +420,32 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
new String[] { "", "", "", "", "カタカナ" },
new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
}
public void testCombiningMarks() throws Exception {
checkOneTerm(a, "ざ", "ざ"); // hiragana
checkOneTerm(a, "ザ", "ザ"); // katakana
checkOneTerm(a, "壹゙", "壹゙"); // ideographic
checkOneTerm(a, "아゙", "아゙"); // hangul
}
/** @deprecated remove this and sophisticated backwards layer in 5.0 */
@Deprecated
public void testCombiningMarksBackwards() throws Exception {
Analyzer a = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents
(String fieldName, Reader reader) {
Tokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
return new TokenStreamComponents(tokenizer);
}
};
checkOneTerm(a, "ざ", ""); // hiragana Bug
checkOneTerm(a, "ザ", "ザ"); // katakana Works
checkOneTerm(a, "壹゙", ""); // ideographic Bug
checkOneTerm(a, "아゙", "아゙"); // hangul Works
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);

View File

@ -51,7 +51,7 @@ public class UAX29URLEmailTokenizerFactory extends BaseTokenizerFactory {
}
public UAX29URLEmailTokenizer create(Reader input) {
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(input);
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(luceneMatchVersion, input);
tokenizer.setMaxTokenLength(maxTokenLength);
return tokenizer;
}

View File

@ -19,6 +19,7 @@ package org.apache.solr.analysis;
import java.io.Reader;
import java.io.StringReader;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
@ -173,4 +174,22 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenTestCase {
assertTokenStreamContents(stream,
new String[] {"one", "two", "three", longWord, "four", "five", "six" });
}
/** @deprecated nuke this test in lucene 5.0 */
@Deprecated
public void testMatchVersion() throws Exception {
Reader reader = new StringReader("ざ");
UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
factory.init(DEFAULT_VERSION_PARAM);
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] {"ざ"});
reader = new StringReader("ざ");
factory = new UAX29URLEmailTokenizerFactory();
factory.init(Collections.singletonMap("luceneMatchVersion", "3.1"));
stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] {""}); // old broken behavior
}
}