mirror of https://github.com/apache/lucene.git

commit ef56f5d551 (parent 718f42479f)

LUCENE-3361: port url+email tokenizer to standardtokenizerinterface, fix combining marks bug

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1154936 13f79535-47bb-0310-9956-ffa450edef68
@@ -542,10 +542,11 @@ Bug fixes
* LUCENE-3340: Fixed case where IndexWriter was not flushing at
  exactly maxBufferedDeleteTerms (Mike McCandless)

* LUCENE-3358: StandardTokenizer wrongly discarded combining marks attached
  to Han or Hiragana characters, this is fixed if you supply Version >= 3.4
  If you supply a previous lucene version, you get the old buggy behavior
  for backwards compatibility. (Trejkaz, Robert Muir)

* LUCENE-3358, LUCENE-3361: StandardTokenizer and UAX29URLEmailTokenizer
  wrongly discarded combining marks attached to Han or Hiragana characters,
  this is fixed if you supply Version >= 3.4 If you supply a previous
  lucene version, you get the old buggy behavior for backwards compatibility.
  (Trejkaz, Robert Muir)

New Features
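The CHANGES entry above is the user-visible contract: with Version >= 3.4 the tokenizers keep combining marks attached to Han and Hiragana characters, while earlier versions reproduce the old splitting for index compatibility. As a minimal illustrative sketch (not part of the commit), the snippet below exercises that difference through the two-argument UAX29URLEmailTokenizer constructor this change introduces; it assumes a Lucene build where Version.LUCENE_34 and Version.LUCENE_31 are available, and the expected outputs follow the tests later in this diff.

```java
import java.io.StringReader;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class CombiningMarksDemo {
  // Prints the tokens produced for hiragana "さ" followed by U+3099 (combining voiced sound mark).
  static void dump(Version matchVersion, String label) throws Exception {
    String input = "\u3055\u3099"; // さ + combining voiced sound mark, i.e. "ざ" in decomposed form
    UAX29URLEmailTokenizer tokenizer =
        new UAX29URLEmailTokenizer(matchVersion, new StringReader(input));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      System.out.println(label + ": [" + term.toString() + "]");
    }
    tokenizer.end();
    tokenizer.close();
  }

  public static void main(String[] args) throws Exception {
    dump(Version.LUCENE_34, "3.4"); // expected: [ざ] -- combining mark kept with the base character
    dump(Version.LUCENE_31, "3.1"); // expected: [さ] -- old behavior, combining mark discarded
  }
}
```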
@@ -56,7 +56,7 @@
nobak="on"/>
</target>

<target name="jflex-StandardAnalyzer" depends="init,jflex-check,gen-tlds" if="jflex.present">
<target name="jflex-StandardAnalyzer" depends="init,jflex-check" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>
</taskdef>

@@ -76,9 +76,12 @@
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>
</taskdef>
<jflex file="src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex"
<jflex file="src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex"
outdir="src/java/org/apache/lucene/analysis/standard"
nobak="on" />
<jflex file="src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex"
outdir="src/java/org/apache/lucene/analysis/standard/std31"
nobak="on" />
</target>

<target name="clean-jflex">

@@ -86,7 +89,7 @@
<fileset dir="src/java/org/apache/lucene/analysis/wikipedia" includes="*.java">
<containsregexp expression="generated.*by.*JFlex"/>
</fileset>
<fileset dir="src/java/org/apache/lucene/analysis/standard" includes="*.java">
<fileset dir="src/java/org/apache/lucene/analysis/standard" includes="**/*.java">
<containsregexp expression="generated.*by.*JFlex"/>
</fileset>
</delete>
@@ -15,8 +15,8 @@
*/

// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
// file version from Wednesday, February 9, 2011 12:34:10 PM UTC
// generated on Wednesday, February 9, 2011 4:45:18 PM UTC
// file version from Thursday, August 4, 2011 11:34:20 AM UTC
// generated on Thursday, August 4, 2011 11:46:19 PM UTC
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros

ASCIITLD = "." (

@@ -288,6 +288,7 @@ ASCIITLD = "." (
| [xX][nN]--3[eE]0[bB]707[eE]
| [xX][nN]--45[bB][rR][jJ]9[cC]
| [xX][nN]--80[aA][kK][hH][bB][yY][kK][nN][jJ]4[fF]
| [xX][nN]--90[aA]3[aA][cC]
| [xX][nN]--9[tT]4[bB]11[yY][iI]5[aA]
| [xX][nN]--[cC][lL][cC][hH][cC]0[eE][aA]0[bB]2[gG]2[aA]9[gG][cC][dD]
| [xX][nN]--[dD][eE][bB][aA]0[aA][dD]

@@ -305,9 +306,11 @@ ASCIITLD = "." (
| [xX][nN]--[kK][gG][bB][eE][cC][hH][tT][vV]
| [xX][nN]--[kK][pP][rR][wW]13[dD]
| [xX][nN]--[kK][pP][rR][yY]57[dD]
| [xX][nN]--[lL][gG][bB][bB][aA][tT]1[aA][dD]8[jJ]
| [xX][nN]--[mM][gG][bB][aA][aA][mM]7[aA]8[hH]
| [xX][nN]--[mM][gG][bB][aA][yY][hH]7[gG][pP][aA]
| [xX][nN]--[mM][gG][bB][bB][hH]1[aA]71[eE]
| [xX][nN]--[mM][gG][bB][cC]0[aA]9[aA][zZ][cC][gG]
| [xX][nN]--[mM][gG][bB][eE][rR][pP]4[aA]5[dD]4[aA][rR]
| [xX][nN]--[oO]3[cC][wW]4[hH]
| [xX][nN]--[oO][gG][bB][pP][fF]8[fF][lL]

@@ -321,6 +324,7 @@ ASCIITLD = "." (
| [xX][nN]--[yY][fF][rR][oO]4[iI]67[oO]
| [xX][nN]--[yY][gG][bB][iI]2[aA][mM][mM][xX]
| [xX][nN]--[zZ][cC][kK][zZ][aA][hH]
| [xX][xX][xX]
| [yY][eE]
| [yY][tT]
| [zZ][aA]
(Two file diffs suppressed because they are too large.)
@@ -17,16 +17,7 @@ package org.apache.lucene.analysis.standard;
* limitations under the License.
*/

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;


/**
* This class implements Word Break rules from the Unicode Text Segmentation

@@ -49,20 +40,14 @@ import org.apache.lucene.util.AttributeSource;
%%

%unicode 6.0
%integer
%final
%public
%apiprivate
%class UAX29URLEmailTokenizer
%extends Tokenizer
%type boolean
%class UAX29URLEmailTokenizerImpl
%implements StandardTokenizerInterface
%function getNextToken
%char

%init{
super(in);
%init}


%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
Format = ([\p{WB:Format}] | {FormatSupp})

@@ -89,6 +74,8 @@ MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*

HanEx = {Han} ({Format} | {Extend})*
HiraganaEx = {Hiragana} ({Format} | {Extend})*

// URL and E-mail syntax specifications:
//

@@ -170,16 +157,10 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})

%{
/** Alphanumeric sequences */
public static final String WORD_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
public static final int WORD_TYPE = UAX29URLEmailTokenizer.ALPHANUM;

/** Numbers */
public static final String NUMERIC_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];

/** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
public static final String URL_TYPE = "<URL>";

/** E-mail addresses */
public static final String EMAIL_TYPE = "<EMAIL>";
public static final int NUMERIC_TYPE = UAX29URLEmailTokenizer.NUM;

/**
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian

@@ -189,114 +170,30 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
* <p>
* See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
*/
public static final String SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.SOUTHEAST_ASIAN];
public static final int SOUTH_EAST_ASIAN_TYPE = UAX29URLEmailTokenizer.SOUTHEAST_ASIAN;

public static final String IDEOGRAPHIC_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
public static final int IDEOGRAPHIC_TYPE = UAX29URLEmailTokenizer.IDEOGRAPHIC;

public static final String HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
public static final int HIRAGANA_TYPE = UAX29URLEmailTokenizer.HIRAGANA;

public static final String KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
public static final int KATAKANA_TYPE = UAX29URLEmailTokenizer.KATAKANA;

public static final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];
public static final int HANGUL_TYPE = UAX29URLEmailTokenizer.HANGUL;

private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt
= addAttribute(PositionIncrementAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
public static final int EMAIL_TYPE = UAX29URLEmailTokenizer.EMAIL;

private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
private int posIncr;
public static final int URL_TYPE = UAX29URLEmailTokenizer.URL;


/**
* @param source The AttributeSource to use
* @param input The input reader
*/
public UAX29URLEmailTokenizer(AttributeSource source, Reader input) {
super(source, input);
zzReader = input;
public final int yychar()
{
return yychar;
}

/**
* @param factory The AttributeFactory to use
* @param input The input reader
* Fills CharTermAttribute with the current token text.
*/
public UAX29URLEmailTokenizer(AttributeFactory factory, Reader input) {
super(factory, input);
zzReader = input;
}

/**
* Set the max allowed token length. Any token longer than this is skipped.
* @param length the new max allowed token length
*/
public void setMaxTokenLength(int length) {
this.maxTokenLength = length;
}

/**
* Returns the max allowed token length. Any token longer than this is
* skipped.
* @return the max allowed token length
*/
public int getMaxTokenLength() {
return maxTokenLength;
}

@Override
public final void end() {
// set final offset
int finalOffset = correctOffset(yychar + yylength());
offsetAtt.setOffset(finalOffset, finalOffset);
}

@Override
public void reset(Reader reader) throws IOException {
super.reset(reader);
yyreset(reader);
}

@Override
public final boolean incrementToken() throws IOException {
// This method is required because of two JFlex limitations:
// 1. No way to insert code at the beginning of the generated scanning
// get-next-token method; and
// 2. No way to declare @Override on the generated scanning method.
clearAttributes();
posIncr = 1;
return getNextToken();
}

/**
* Populates this TokenStream's CharTermAttribute and OffsetAttribute from
* the current match, the TypeAttribute from the passed-in tokenType, and
* the PositionIncrementAttribute to one, unless the immediately previous
* token(s) was/were skipped because maxTokenLength was exceeded, in which
* case the PositionIncrementAttribute is set to one plus the number of
* skipped overly long tokens.
* <p/>
* If maxTokenLength is exceeded, the CharTermAttribute is set back to empty
* and false is returned.
*
* @param tokenType The type of the matching token
* @return true there is a token available (not too long); false otherwise
*/
private boolean populateAttributes(String tokenType) {
boolean isTokenAvailable = false;
if (yylength() > maxTokenLength) {
// When we skip a too-long token, we treat it like a stopword, introducing
// a position increment gap
++posIncr;
} else {
termAtt.copyBuffer(zzBuffer, zzStartRead, yylength());
posIncrAtt.setPositionIncrement(posIncr);
offsetAtt.setOffset(correctOffset(yychar),
correctOffset(yychar + yylength()));
typeAtt.setType(tokenType);
isTokenAvailable = true;
}
return isTokenAvailable;
public final void getText(CharTermAttribute t) {
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
%}

@@ -305,10 +202,10 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
// UAX#29 WB1. sot ÷
// WB2. ÷ eot
//
<<EOF>> { return false; }
<<EOF>> { return StandardTokenizerInterface.YYEOF; }

{URL} { if (populateAttributes(URL_TYPE)) return true; }
{EMAIL} {if (populateAttributes(EMAIL_TYPE)) return true; }
{URL} { return URL_TYPE; }
{EMAIL} { return EMAIL_TYPE; }

// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet) × Numeric

@@ -320,14 +217,14 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
| {MidNumericEx} {NumericEx}
| {NumericEx})*
{ExtendNumLetEx}*
{ if (populateAttributes(NUMERIC_TYPE)) return true; }
{ return NUMERIC_TYPE; }

// subset of the below for typing purposes only!
{HangulEx}+
{ if (populateAttributes(HANGUL_TYPE)) return true; }
{ return HANGUL_TYPE; }

{KatakanaEx}+
{ if (populateAttributes(KATAKANA_TYPE)) return true; }
{ return KATAKANA_TYPE; }

// UAX#29 WB5. ALetter × ALetter
// WB6. ALetter × (MidLetter | MidNumLet) ALetter

@@ -345,7 +242,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
{ExtendNumLetEx}*
{ if (populateAttributes(WORD_TYPE)) return true; }
{ return WORD_TYPE; }


// From UAX #29:

@@ -367,12 +264,12 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
//
// http://www.unicode.org/reports/tr14/#SA
//
{ComplexContext}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }
{ComplexContext}+ { return SOUTH_EAST_ASIAN_TYPE; }

// UAX#29 WB14. Any ÷ Any
//
{Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
{Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }
{HanEx} { return IDEOGRAPHIC_TYPE; }
{HiraganaEx} { return HIRAGANA_TYPE; }


// UAX#29 WB3. CR × LF
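The grammar diff above shows the essence of the port: the JFlex spec no longer generates a Tokenizer subclass with its own attribute handling (populateAttributes and its bookkeeping are gone); it now generates a plain scanner, UAX29URLEmailTokenizerImpl, that implements StandardTokenizerInterface and whose rules simply return int token-type constants (YYEOF at end of input). A hand-written UAX29URLEmailTokenizer then drives the scanner; that wrapper is not reproduced in full in this mirror, so the following is only a rough, illustrative sketch of the pattern StandardTokenizer already uses. The class name ScannerBackedTokenizer and the 255 default length are placeholders, not code from the commit.

```java
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizerInterface;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

/** Illustrative only: how a Tokenizer can drive a StandardTokenizerInterface scanner. */
public final class ScannerBackedTokenizer extends Tokenizer {
  private final StandardTokenizerInterface scanner; // e.g. a generated UAX29URLEmailTokenizerImpl
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private int maxTokenLength = 255; // placeholder default

  public ScannerBackedTokenizer(Reader input, StandardTokenizerInterface scanner) {
    super(input);
    this.scanner = scanner;
  }

  @Override
  public boolean incrementToken() throws IOException {
    clearAttributes();
    int posIncr = 1;
    while (true) {
      int tokenType = scanner.getNextToken();            // rules now return type constants
      if (tokenType == StandardTokenizerInterface.YYEOF) {
        return false;                                    // WB2: end of text
      }
      if (scanner.yylength() <= maxTokenLength) {
        posIncrAtt.setPositionIncrement(posIncr);
        scanner.getText(termAtt);                        // copies the current match into the term attribute
        int start = scanner.yychar();
        offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
        typeAtt.setType(UAX29URLEmailTokenizer.TOKEN_TYPES[tokenType]);
        return true;
      }
      posIncr++; // skipped an over-long token: leave a position increment gap, like a stopword
    }
  }
}
```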
@ -0,0 +1,330 @@
|
|||
/*
|
||||
* Copyright 2001-2005 The Apache Software Foundation.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
|
||||
// file version from Wednesday, February 9, 2011 12:34:10 PM UTC
|
||||
// generated on Wednesday, February 9, 2011 4:45:18 PM UTC
|
||||
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
|
||||
|
||||
ASCIITLD = "." (
|
||||
[aA][cC]
|
||||
| [aA][dD]
|
||||
| [aA][eE]
|
||||
| [aA][eE][rR][oO]
|
||||
| [aA][fF]
|
||||
| [aA][gG]
|
||||
| [aA][iI]
|
||||
| [aA][lL]
|
||||
| [aA][mM]
|
||||
| [aA][nN]
|
||||
| [aA][oO]
|
||||
| [aA][qQ]
|
||||
| [aA][rR]
|
||||
| [aA][rR][pP][aA]
|
||||
| [aA][sS]
|
||||
| [aA][sS][iI][aA]
|
||||
| [aA][tT]
|
||||
| [aA][uU]
|
||||
| [aA][wW]
|
||||
| [aA][xX]
|
||||
| [aA][zZ]
|
||||
| [bB][aA]
|
||||
| [bB][bB]
|
||||
| [bB][dD]
|
||||
| [bB][eE]
|
||||
| [bB][fF]
|
||||
| [bB][gG]
|
||||
| [bB][hH]
|
||||
| [bB][iI]
|
||||
| [bB][iI][zZ]
|
||||
| [bB][jJ]
|
||||
| [bB][mM]
|
||||
| [bB][nN]
|
||||
| [bB][oO]
|
||||
| [bB][rR]
|
||||
| [bB][sS]
|
||||
| [bB][tT]
|
||||
| [bB][vV]
|
||||
| [bB][wW]
|
||||
| [bB][yY]
|
||||
| [bB][zZ]
|
||||
| [cC][aA]
|
||||
| [cC][aA][tT]
|
||||
| [cC][cC]
|
||||
| [cC][dD]
|
||||
| [cC][fF]
|
||||
| [cC][gG]
|
||||
| [cC][hH]
|
||||
| [cC][iI]
|
||||
| [cC][kK]
|
||||
| [cC][lL]
|
||||
| [cC][mM]
|
||||
| [cC][nN]
|
||||
| [cC][oO]
|
||||
| [cC][oO][mM]
|
||||
| [cC][oO][oO][pP]
|
||||
| [cC][rR]
|
||||
| [cC][uU]
|
||||
| [cC][vV]
|
||||
| [cC][xX]
|
||||
| [cC][yY]
|
||||
| [cC][zZ]
|
||||
| [dD][eE]
|
||||
| [dD][jJ]
|
||||
| [dD][kK]
|
||||
| [dD][mM]
|
||||
| [dD][oO]
|
||||
| [dD][zZ]
|
||||
| [eE][cC]
|
||||
| [eE][dD][uU]
|
||||
| [eE][eE]
|
||||
| [eE][gG]
|
||||
| [eE][rR]
|
||||
| [eE][sS]
|
||||
| [eE][tT]
|
||||
| [eE][uU]
|
||||
| [fF][iI]
|
||||
| [fF][jJ]
|
||||
| [fF][kK]
|
||||
| [fF][mM]
|
||||
| [fF][oO]
|
||||
| [fF][rR]
|
||||
| [gG][aA]
|
||||
| [gG][bB]
|
||||
| [gG][dD]
|
||||
| [gG][eE]
|
||||
| [gG][fF]
|
||||
| [gG][gG]
|
||||
| [gG][hH]
|
||||
| [gG][iI]
|
||||
| [gG][lL]
|
||||
| [gG][mM]
|
||||
| [gG][nN]
|
||||
| [gG][oO][vV]
|
||||
| [gG][pP]
|
||||
| [gG][qQ]
|
||||
| [gG][rR]
|
||||
| [gG][sS]
|
||||
| [gG][tT]
|
||||
| [gG][uU]
|
||||
| [gG][wW]
|
||||
| [gG][yY]
|
||||
| [hH][kK]
|
||||
| [hH][mM]
|
||||
| [hH][nN]
|
||||
| [hH][rR]
|
||||
| [hH][tT]
|
||||
| [hH][uU]
|
||||
| [iI][dD]
|
||||
| [iI][eE]
|
||||
| [iI][lL]
|
||||
| [iI][mM]
|
||||
| [iI][nN]
|
||||
| [iI][nN][fF][oO]
|
||||
| [iI][nN][tT]
|
||||
| [iI][oO]
|
||||
| [iI][qQ]
|
||||
| [iI][rR]
|
||||
| [iI][sS]
|
||||
| [iI][tT]
|
||||
| [jJ][eE]
|
||||
| [jJ][mM]
|
||||
| [jJ][oO]
|
||||
| [jJ][oO][bB][sS]
|
||||
| [jJ][pP]
|
||||
| [kK][eE]
|
||||
| [kK][gG]
|
||||
| [kK][hH]
|
||||
| [kK][iI]
|
||||
| [kK][mM]
|
||||
| [kK][nN]
|
||||
| [kK][pP]
|
||||
| [kK][rR]
|
||||
| [kK][wW]
|
||||
| [kK][yY]
|
||||
| [kK][zZ]
|
||||
| [lL][aA]
|
||||
| [lL][bB]
|
||||
| [lL][cC]
|
||||
| [lL][iI]
|
||||
| [lL][kK]
|
||||
| [lL][rR]
|
||||
| [lL][sS]
|
||||
| [lL][tT]
|
||||
| [lL][uU]
|
||||
| [lL][vV]
|
||||
| [lL][yY]
|
||||
| [mM][aA]
|
||||
| [mM][cC]
|
||||
| [mM][dD]
|
||||
| [mM][eE]
|
||||
| [mM][gG]
|
||||
| [mM][hH]
|
||||
| [mM][iI][lL]
|
||||
| [mM][kK]
|
||||
| [mM][lL]
|
||||
| [mM][mM]
|
||||
| [mM][nN]
|
||||
| [mM][oO]
|
||||
| [mM][oO][bB][iI]
|
||||
| [mM][pP]
|
||||
| [mM][qQ]
|
||||
| [mM][rR]
|
||||
| [mM][sS]
|
||||
| [mM][tT]
|
||||
| [mM][uU]
|
||||
| [mM][uU][sS][eE][uU][mM]
|
||||
| [mM][vV]
|
||||
| [mM][wW]
|
||||
| [mM][xX]
|
||||
| [mM][yY]
|
||||
| [mM][zZ]
|
||||
| [nN][aA]
|
||||
| [nN][aA][mM][eE]
|
||||
| [nN][cC]
|
||||
| [nN][eE]
|
||||
| [nN][eE][tT]
|
||||
| [nN][fF]
|
||||
| [nN][gG]
|
||||
| [nN][iI]
|
||||
| [nN][lL]
|
||||
| [nN][oO]
|
||||
| [nN][pP]
|
||||
| [nN][rR]
|
||||
| [nN][uU]
|
||||
| [nN][zZ]
|
||||
| [oO][mM]
|
||||
| [oO][rR][gG]
|
||||
| [pP][aA]
|
||||
| [pP][eE]
|
||||
| [pP][fF]
|
||||
| [pP][gG]
|
||||
| [pP][hH]
|
||||
| [pP][kK]
|
||||
| [pP][lL]
|
||||
| [pP][mM]
|
||||
| [pP][nN]
|
||||
| [pP][rR]
|
||||
| [pP][rR][oO]
|
||||
| [pP][sS]
|
||||
| [pP][tT]
|
||||
| [pP][wW]
|
||||
| [pP][yY]
|
||||
| [qQ][aA]
|
||||
| [rR][eE]
|
||||
| [rR][oO]
|
||||
| [rR][sS]
|
||||
| [rR][uU]
|
||||
| [rR][wW]
|
||||
| [sS][aA]
|
||||
| [sS][bB]
|
||||
| [sS][cC]
|
||||
| [sS][dD]
|
||||
| [sS][eE]
|
||||
| [sS][gG]
|
||||
| [sS][hH]
|
||||
| [sS][iI]
|
||||
| [sS][jJ]
|
||||
| [sS][kK]
|
||||
| [sS][lL]
|
||||
| [sS][mM]
|
||||
| [sS][nN]
|
||||
| [sS][oO]
|
||||
| [sS][rR]
|
||||
| [sS][tT]
|
||||
| [sS][uU]
|
||||
| [sS][vV]
|
||||
| [sS][yY]
|
||||
| [sS][zZ]
|
||||
| [tT][cC]
|
||||
| [tT][dD]
|
||||
| [tT][eE][lL]
|
||||
| [tT][fF]
|
||||
| [tT][gG]
|
||||
| [tT][hH]
|
||||
| [tT][jJ]
|
||||
| [tT][kK]
|
||||
| [tT][lL]
|
||||
| [tT][mM]
|
||||
| [tT][nN]
|
||||
| [tT][oO]
|
||||
| [tT][pP]
|
||||
| [tT][rR]
|
||||
| [tT][rR][aA][vV][eE][lL]
|
||||
| [tT][tT]
|
||||
| [tT][vV]
|
||||
| [tT][wW]
|
||||
| [tT][zZ]
|
||||
| [uU][aA]
|
||||
| [uU][gG]
|
||||
| [uU][kK]
|
||||
| [uU][sS]
|
||||
| [uU][yY]
|
||||
| [uU][zZ]
|
||||
| [vV][aA]
|
||||
| [vV][cC]
|
||||
| [vV][eE]
|
||||
| [vV][gG]
|
||||
| [vV][iI]
|
||||
| [vV][nN]
|
||||
| [vV][uU]
|
||||
| [wW][fF]
|
||||
| [wW][sS]
|
||||
| [xX][nN]--0[zZ][wW][mM]56[dD]
|
||||
| [xX][nN]--11[bB]5[bB][sS]3[aA]9[aA][jJ]6[gG]
|
||||
| [xX][nN]--3[eE]0[bB]707[eE]
|
||||
| [xX][nN]--45[bB][rR][jJ]9[cC]
|
||||
| [xX][nN]--80[aA][kK][hH][bB][yY][kK][nN][jJ]4[fF]
|
||||
| [xX][nN]--9[tT]4[bB]11[yY][iI]5[aA]
|
||||
| [xX][nN]--[cC][lL][cC][hH][cC]0[eE][aA]0[bB]2[gG]2[aA]9[gG][cC][dD]
|
||||
| [xX][nN]--[dD][eE][bB][aA]0[aA][dD]
|
||||
| [xX][nN]--[fF][iI][qQ][sS]8[sS]
|
||||
| [xX][nN]--[fF][iI][qQ][zZ]9[sS]
|
||||
| [xX][nN]--[fF][pP][cC][rR][jJ]9[cC]3[dD]
|
||||
| [xX][nN]--[fF][zZ][cC]2[cC]9[eE]2[cC]
|
||||
| [xX][nN]--[gG]6[wW]251[dD]
|
||||
| [xX][nN]--[gG][eE][cC][rR][jJ]9[cC]
|
||||
| [xX][nN]--[hH]2[bB][rR][jJ]9[cC]
|
||||
| [xX][nN]--[hH][gG][bB][kK]6[aA][jJ]7[fF]53[bB][bB][aA]
|
||||
| [xX][nN]--[hH][lL][cC][jJ]6[aA][yY][aA]9[eE][sS][cC]7[aA]
|
||||
| [xX][nN]--[jJ]6[wW]193[gG]
|
||||
| [xX][nN]--[jJ][xX][aA][lL][pP][dD][lL][pP]
|
||||
| [xX][nN]--[kK][gG][bB][eE][cC][hH][tT][vV]
|
||||
| [xX][nN]--[kK][pP][rR][wW]13[dD]
|
||||
| [xX][nN]--[kK][pP][rR][yY]57[dD]
|
||||
| [xX][nN]--[mM][gG][bB][aA][aA][mM]7[aA]8[hH]
|
||||
| [xX][nN]--[mM][gG][bB][aA][yY][hH]7[gG][pP][aA]
|
||||
| [xX][nN]--[mM][gG][bB][bB][hH]1[aA]71[eE]
|
||||
| [xX][nN]--[mM][gG][bB][eE][rR][pP]4[aA]5[dD]4[aA][rR]
|
||||
| [xX][nN]--[oO]3[cC][wW]4[hH]
|
||||
| [xX][nN]--[oO][gG][bB][pP][fF]8[fF][lL]
|
||||
| [xX][nN]--[pP]1[aA][iI]
|
||||
| [xX][nN]--[pP][gG][bB][sS]0[dD][hH]
|
||||
| [xX][nN]--[sS]9[bB][rR][jJ]9[cC]
|
||||
| [xX][nN]--[wW][gG][bB][hH]1[cC]
|
||||
| [xX][nN]--[wW][gG][bB][lL]6[aA]
|
||||
| [xX][nN]--[xX][kK][cC]2[aA][lL]3[hH][yY][eE]2[aA]
|
||||
| [xX][nN]--[xX][kK][cC]2[dD][lL]3[aA]5[eE][eE]0[hH]
|
||||
| [xX][nN]--[yY][fF][rR][oO]4[iI]67[oO]
|
||||
| [xX][nN]--[yY][gG][bB][iI]2[aA][mM][mM][xX]
|
||||
| [xX][nN]--[zZ][cC][kK][zZ][aA][hH]
|
||||
| [yY][eE]
|
||||
| [yY][tT]
|
||||
| [zZ][aA]
|
||||
| [zZ][mM]
|
||||
| [zZ][wW]
|
||||
) "."? // Accept trailing root (empty) domain
|
||||
|
|
@ -0,0 +1,125 @@
|
|||
/*
|
||||
* Copyright 2010 The Apache Software Foundation.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// Generated using ICU4J 4.6.0.0 on Wednesday, February 9, 2011 4:45:11 PM UTC
|
||||
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
|
||||
|
||||
|
||||
ALetterSupp = (
|
||||
([\ud80d][\uDC00-\uDC2E])
|
||||
| ([\ud80c][\uDC00-\uDFFF])
|
||||
| ([\ud809][\uDC00-\uDC62])
|
||||
| ([\ud808][\uDC00-\uDF6E])
|
||||
| ([\ud81a][\uDC00-\uDE38])
|
||||
| ([\ud804][\uDC03-\uDC37\uDC83-\uDCAF])
|
||||
| ([\ud835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB])
|
||||
| ([\ud801][\uDC00-\uDC9D])
|
||||
| ([\ud800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5])
|
||||
| ([\ud803][\uDC00-\uDC48])
|
||||
| ([\ud802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72])
|
||||
)
|
||||
FormatSupp = (
|
||||
([\ud804][\uDCBD])
|
||||
| ([\ud834][\uDD73-\uDD7A])
|
||||
| ([\udb40][\uDC01\uDC20-\uDC7F])
|
||||
)
|
||||
ExtendSupp = (
|
||||
([\ud804][\uDC00-\uDC02\uDC38-\uDC46\uDC80-\uDC82\uDCB0-\uDCBA])
|
||||
| ([\ud834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44])
|
||||
| ([\ud800][\uDDFD])
|
||||
| ([\udb40][\uDD00-\uDDEF])
|
||||
| ([\ud802][\uDE01-\uDE03\uDE05\uDE06\uDE0C-\uDE0F\uDE38-\uDE3A\uDE3F])
|
||||
)
|
||||
NumericSupp = (
|
||||
([\ud804][\uDC66-\uDC6F])
|
||||
| ([\ud835][\uDFCE-\uDFFF])
|
||||
| ([\ud801][\uDCA0-\uDCA9])
|
||||
)
|
||||
KatakanaSupp = (
|
||||
([\ud82c][\uDC00])
|
||||
)
|
||||
MidLetterSupp = (
|
||||
[]
|
||||
)
|
||||
MidNumSupp = (
|
||||
[]
|
||||
)
|
||||
MidNumLetSupp = (
|
||||
[]
|
||||
)
|
||||
ExtendNumLetSupp = (
|
||||
[]
|
||||
)
|
||||
ExtendNumLetSupp = (
|
||||
[]
|
||||
)
|
||||
ComplexContextSupp = (
|
||||
[]
|
||||
)
|
||||
HanSupp = (
|
||||
([\ud87e][\uDC00-\uDE1D])
|
||||
| ([\ud86b][\uDC00-\uDFFF])
|
||||
| ([\ud86a][\uDC00-\uDFFF])
|
||||
| ([\ud869][\uDC00-\uDED6\uDF00-\uDFFF])
|
||||
| ([\ud868][\uDC00-\uDFFF])
|
||||
| ([\ud86e][\uDC00-\uDC1D])
|
||||
| ([\ud86d][\uDC00-\uDF34\uDF40-\uDFFF])
|
||||
| ([\ud86c][\uDC00-\uDFFF])
|
||||
| ([\ud863][\uDC00-\uDFFF])
|
||||
| ([\ud862][\uDC00-\uDFFF])
|
||||
| ([\ud861][\uDC00-\uDFFF])
|
||||
| ([\ud860][\uDC00-\uDFFF])
|
||||
| ([\ud867][\uDC00-\uDFFF])
|
||||
| ([\ud866][\uDC00-\uDFFF])
|
||||
| ([\ud865][\uDC00-\uDFFF])
|
||||
| ([\ud864][\uDC00-\uDFFF])
|
||||
| ([\ud858][\uDC00-\uDFFF])
|
||||
| ([\ud859][\uDC00-\uDFFF])
|
||||
| ([\ud85a][\uDC00-\uDFFF])
|
||||
| ([\ud85b][\uDC00-\uDFFF])
|
||||
| ([\ud85c][\uDC00-\uDFFF])
|
||||
| ([\ud85d][\uDC00-\uDFFF])
|
||||
| ([\ud85e][\uDC00-\uDFFF])
|
||||
| ([\ud85f][\uDC00-\uDFFF])
|
||||
| ([\ud850][\uDC00-\uDFFF])
|
||||
| ([\ud851][\uDC00-\uDFFF])
|
||||
| ([\ud852][\uDC00-\uDFFF])
|
||||
| ([\ud853][\uDC00-\uDFFF])
|
||||
| ([\ud854][\uDC00-\uDFFF])
|
||||
| ([\ud855][\uDC00-\uDFFF])
|
||||
| ([\ud856][\uDC00-\uDFFF])
|
||||
| ([\ud857][\uDC00-\uDFFF])
|
||||
| ([\ud849][\uDC00-\uDFFF])
|
||||
| ([\ud848][\uDC00-\uDFFF])
|
||||
| ([\ud84b][\uDC00-\uDFFF])
|
||||
| ([\ud84a][\uDC00-\uDFFF])
|
||||
| ([\ud84d][\uDC00-\uDFFF])
|
||||
| ([\ud84c][\uDC00-\uDFFF])
|
||||
| ([\ud84f][\uDC00-\uDFFF])
|
||||
| ([\ud84e][\uDC00-\uDFFF])
|
||||
| ([\ud841][\uDC00-\uDFFF])
|
||||
| ([\ud840][\uDC00-\uDFFF])
|
||||
| ([\ud843][\uDC00-\uDFFF])
|
||||
| ([\ud842][\uDC00-\uDFFF])
|
||||
| ([\ud845][\uDC00-\uDFFF])
|
||||
| ([\ud844][\uDC00-\uDFFF])
|
||||
| ([\ud847][\uDC00-\uDFFF])
|
||||
| ([\ud846][\uDC00-\uDFFF])
|
||||
)
|
||||
HiraganaSupp = (
|
||||
([\ud83c][\uDE00])
|
||||
| ([\ud82c][\uDC01])
|
||||
)
|
|
@@ -39,7 +39,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%function getNextToken
%char

%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
%include src/java/org/apache/lucene/analysis/standard/std31/SUPPLEMENTARY.jflex-macro
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
Format = ([\p{WB:Format}] | {FormatSupp})
Numeric = ([\p{WB:Numeric}] | {NumericSupp})
(File diff suppressed because it is too large.)
@ -0,0 +1,269 @@
|
|||
package org.apache.lucene.analysis.standard.std31;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizerInterface;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
||||
/**
|
||||
* This class implements UAX29URLEmailTokenizer, except with a bug
|
||||
* (https://issues.apache.org/jira/browse/LUCENE-3358) where Han and Hiragana
|
||||
* characters would be split from combining characters:
|
||||
* @deprecated This class is only for exact backwards compatibility
|
||||
*/
|
||||
@Deprecated
|
||||
%%
|
||||
|
||||
%unicode 6.0
|
||||
%integer
|
||||
%final
|
||||
%public
|
||||
%class UAX29URLEmailTokenizerImpl31
|
||||
%implements StandardTokenizerInterface
|
||||
%function getNextToken
|
||||
%char
|
||||
|
||||
%include src/java/org/apache/lucene/analysis/standard/std31/SUPPLEMENTARY.jflex-macro
|
||||
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
|
||||
Format = ([\p{WB:Format}] | {FormatSupp})
|
||||
Numeric = ([\p{WB:Numeric}] | {NumericSupp})
|
||||
Extend = ([\p{WB:Extend}] | {ExtendSupp})
|
||||
Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
|
||||
MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
|
||||
MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
|
||||
MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
|
||||
ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
|
||||
ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
|
||||
Han = ([\p{Script:Han}] | {HanSupp})
|
||||
Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
|
||||
|
||||
// Script=Hangul & Aletter
|
||||
HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
|
||||
// UAX#29 WB4. X (Extend | Format)* --> X
|
||||
//
|
||||
ALetterEx = {ALetter} ({Format} | {Extend})*
|
||||
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
|
||||
NumericEx = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
|
||||
KatakanaEx = {Katakana} ({Format} | {Extend})*
|
||||
MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
|
||||
MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
|
||||
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
|
||||
|
||||
|
||||
// URL and E-mail syntax specifications:
|
||||
//
|
||||
// RFC-952: DOD INTERNET HOST TABLE SPECIFICATION
|
||||
// RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
|
||||
// RFC-1123: Requirements for Internet Hosts - Application and Support
|
||||
// RFC-1738: Uniform Resource Locators (URL)
|
||||
// RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
|
||||
// RFC-5234: Augmented BNF for Syntax Specifications: ABNF
|
||||
// RFC-5321: Simple Mail Transfer Protocol
|
||||
// RFC-5322: Internet Message Format
|
||||
|
||||
%include src/java/org/apache/lucene/analysis/standard/std31/ASCIITLD.jflex-macro
|
||||
|
||||
DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
|
||||
DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
|
||||
DomainNameLoose = {DomainLabel} ("." {DomainLabel})*
|
||||
|
||||
IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
|
||||
IPv4Address = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3}
|
||||
IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
|
||||
IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
|
||||
IPv6Address = ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
|
||||
| "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
|
||||
| {IPv6Hex16Bit}? "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
|
||||
| (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
|
||||
| (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
|
||||
| (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit} ":" {IPv6LeastSignificant32Bits}
|
||||
| (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::" {IPv6LeastSignificant32Bits}
|
||||
| (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit}
|
||||
| (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"
|
||||
|
||||
URIunreserved = [-._~A-Za-z0-9]
|
||||
URIpercentEncoded = "%" [0-9A-Fa-f]{2}
|
||||
URIsubDelims = [!$&'()*+,;=]
|
||||
URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
|
||||
URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
|
||||
URIquery = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
|
||||
URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
|
||||
URIport = ":" [0-9]{1,5}
|
||||
URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}
|
||||
URIhostLoose = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose}
|
||||
|
||||
URIauthorityStrict = {URIhostStrict} {URIport}?
|
||||
URIauthorityLoose = {URIlogin}? {URIhostLoose} {URIport}?
|
||||
|
||||
HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
|
||||
HTTPpath = ("/" {HTTPsegment})*
|
||||
HTTPscheme = [hH][tT][tT][pP][sS]? "://"
|
||||
HTTPurlFull = {HTTPscheme} {URIauthorityLoose} {HTTPpath}? {URIquery}? {URIfragment}?
|
||||
// {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
|
||||
HTTPurlNoScheme = {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
|
||||
HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}
|
||||
|
||||
FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
|
||||
FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
|
||||
FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
|
||||
FTPscheme = [fF][tT][pP] "://"
|
||||
FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?
|
||||
|
||||
FILEscheme = [fF][iI][lL][eE] "://"
|
||||
FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?
|
||||
|
||||
URL = {HTTPurl} | {FTPurl} | {FILEurl}
|
||||
|
||||
EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
|
||||
EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
|
||||
EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
|
||||
EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
|
||||
EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
|
||||
// DFA minimization allows {IPv6Address} and {IPv4Address} to be included
|
||||
// in the {EMAILbracketedHost} definition without incurring any size penalties,
|
||||
// since {EMAILdomainLiteralText} recognizes all valid IP addresses.
|
||||
// The IP address regexes are included in {EMAILbracketedHost} simply as a
|
||||
// reminder that they are acceptable bracketed host forms.
|
||||
EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"
|
||||
EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
|
||||
|
||||
|
||||
%{
|
||||
/** Alphanumeric sequences */
|
||||
public static final int WORD_TYPE = UAX29URLEmailTokenizer.ALPHANUM;
|
||||
|
||||
/** Numbers */
|
||||
public static final int NUMERIC_TYPE = UAX29URLEmailTokenizer.NUM;
|
||||
|
||||
/**
|
||||
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
|
||||
* scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
|
||||
* together as as a single token rather than broken up, because the logic
|
||||
* required to break them at word boundaries is too complex for UAX#29.
|
||||
* <p>
|
||||
* See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
|
||||
*/
|
||||
public static final int SOUTH_EAST_ASIAN_TYPE = UAX29URLEmailTokenizer.SOUTHEAST_ASIAN;
|
||||
|
||||
public static final int IDEOGRAPHIC_TYPE = UAX29URLEmailTokenizer.IDEOGRAPHIC;
|
||||
|
||||
public static final int HIRAGANA_TYPE = UAX29URLEmailTokenizer.HIRAGANA;
|
||||
|
||||
public static final int KATAKANA_TYPE = UAX29URLEmailTokenizer.KATAKANA;
|
||||
|
||||
public static final int HANGUL_TYPE = UAX29URLEmailTokenizer.HANGUL;
|
||||
|
||||
public static final int EMAIL_TYPE = UAX29URLEmailTokenizer.EMAIL;
|
||||
|
||||
public static final int URL_TYPE = UAX29URLEmailTokenizer.URL;
|
||||
|
||||
public final int yychar()
|
||||
{
|
||||
return yychar;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills CharTermAttribute with the current token text.
|
||||
*/
|
||||
public final void getText(CharTermAttribute t) {
|
||||
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
||||
}
|
||||
%}
|
||||
|
||||
%%
|
||||
|
||||
// UAX#29 WB1. sot ÷
|
||||
// WB2. ÷ eot
|
||||
//
|
||||
<<EOF>> { return StandardTokenizerInterface.YYEOF; }
|
||||
|
||||
{URL} { return URL_TYPE; }
|
||||
{EMAIL} { return EMAIL_TYPE; }
|
||||
|
||||
// UAX#29 WB8. Numeric × Numeric
|
||||
// WB11. Numeric (MidNum | MidNumLet) × Numeric
|
||||
// WB12. Numeric × (MidNum | MidNumLet) Numeric
|
||||
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
||||
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
|
||||
//
|
||||
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
|
||||
| {MidNumericEx} {NumericEx}
|
||||
| {NumericEx})*
|
||||
{ExtendNumLetEx}*
|
||||
{ return NUMERIC_TYPE; }
|
||||
|
||||
// subset of the below for typing purposes only!
|
||||
{HangulEx}+
|
||||
{ return HANGUL_TYPE; }
|
||||
|
||||
{KatakanaEx}+
|
||||
{ return KATAKANA_TYPE; }
|
||||
|
||||
// UAX#29 WB5. ALetter × ALetter
|
||||
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
|
||||
// WB7. ALetter (MidLetter | MidNumLet) × ALetter
|
||||
// WB9. ALetter × Numeric
|
||||
// WB10. Numeric × ALetter
|
||||
// WB13. Katakana × Katakana
|
||||
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
||||
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
|
||||
//
|
||||
{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
|
||||
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
|
||||
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ )
|
||||
({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
|
||||
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
|
||||
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
|
||||
{ExtendNumLetEx}*
|
||||
{ return WORD_TYPE; }
|
||||
|
||||
|
||||
// From UAX #29:
|
||||
//
|
||||
// [C]haracters with the Line_Break property values of Contingent_Break (CB),
|
||||
// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
|
||||
// boundary property values based on criteria outside of the scope of this
|
||||
// annex. That means that satisfactory treatment of languages like Chinese
|
||||
// or Thai requires special handling.
|
||||
//
|
||||
// In Unicode 6.0, only one character has the \p{Line_Break = Contingent_Break}
|
||||
// property: U+FFFC (  ) OBJECT REPLACEMENT CHARACTER.
|
||||
//
|
||||
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
|
||||
// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
|
||||
// Lao, etc.) are kept together. This grammar does the same below.
|
||||
//
|
||||
// See also the Unicode Line Breaking Algorithm:
|
||||
//
|
||||
// http://www.unicode.org/reports/tr14/#SA
|
||||
//
|
||||
{ComplexContext}+ { return SOUTH_EAST_ASIAN_TYPE; }
|
||||
|
||||
// UAX#29 WB14. Any ÷ Any
|
||||
//
|
||||
{Han} { return IDEOGRAPHIC_TYPE; }
|
||||
{Hiragana} { return HIRAGANA_TYPE; }
|
||||
|
||||
|
||||
// UAX#29 WB3. CR × LF
|
||||
// WB3a. (Newline | CR | LF) ÷
|
||||
// WB3b. ÷ (Newline | CR | LF)
|
||||
// WB14. Any ÷ Any
|
||||
//
|
||||
[^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
|
|
@@ -5,9 +5,11 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.apache.lucene.util.Version;

import java.io.BufferedReader;
import java.io.IOException;

@@ -44,7 +46,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
sb.append(whitespace);
sb.append("testing 1234");
String input = sb.toString();
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(new StringReader(input));
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}

@@ -53,7 +55,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
protected TokenStreamComponents createComponents
(String fieldName, Reader reader) {

Tokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
Tokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, reader);
return new TokenStreamComponents(tokenizer);
}
};

@@ -69,7 +71,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
public final boolean incrementToken() throws java.io.IOException {
boolean isTokenAvailable = false;
while (input.incrementToken()) {
if (typeAtt.type() == UAX29URLEmailTokenizer.URL_TYPE) {
if (typeAtt.type() == UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.URL]) {
isTokenAvailable = true;
break;
}

@@ -88,7 +90,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
public final boolean incrementToken() throws java.io.IOException {
boolean isTokenAvailable = false;
while (input.incrementToken()) {
if (typeAtt.type() == UAX29URLEmailTokenizer.EMAIL_TYPE) {
if (typeAtt.type() == UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.EMAIL]) {
isTokenAvailable = true;
break;
}

@@ -100,7 +102,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
private Analyzer urlAnalyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, reader);
tokenizer.setMaxTokenLength(Integer.MAX_VALUE); // Tokenize arbitrary length URLs
TokenFilter filter = new URLFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);

@@ -110,7 +112,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
private Analyzer emailAnalyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, reader);
TokenFilter filter = new EmailFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}

@@ -419,6 +421,31 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
}

public void testCombiningMarks() throws Exception {
checkOneTerm(a, "ざ", "ざ"); // hiragana
checkOneTerm(a, "ザ", "ザ"); // katakana
checkOneTerm(a, "壹゙", "壹゙"); // ideographic
checkOneTerm(a, "아゙", "아゙"); // hangul
}

/** @deprecated remove this and sophisticated backwards layer in 5.0 */
@Deprecated
public void testCombiningMarksBackwards() throws Exception {
Analyzer a = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents
(String fieldName, Reader reader) {

Tokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
return new TokenStreamComponents(tokenizer);
}
};
checkOneTerm(a, "ざ", "さ"); // hiragana Bug
checkOneTerm(a, "ザ", "ザ"); // katakana Works
checkOneTerm(a, "壹゙", "壹"); // ideographic Bug
checkOneTerm(a, "아゙", "아゙"); // hangul Works
}

/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
@@ -51,7 +51,7 @@ public class UAX29URLEmailTokenizerFactory extends BaseTokenizerFactory {
}

public UAX29URLEmailTokenizer create(Reader input) {
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(input);
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(luceneMatchVersion, input);
tokenizer.setMaxTokenLength(maxTokenLength);
return tokenizer;
}
@@ -19,6 +19,7 @@ package org.apache.solr.analysis;

import java.io.Reader;
import java.io.StringReader;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

@@ -173,4 +174,22 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenTestCase {
assertTokenStreamContents(stream,
new String[] {"one", "two", "three", longWord, "four", "five", "six" });
}

/** @deprecated nuke this test in lucene 5.0 */
@Deprecated
public void testMatchVersion() throws Exception {
Reader reader = new StringReader("ざ");
UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory();
factory.init(DEFAULT_VERSION_PARAM);
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] {"ざ"});

reader = new StringReader("ざ");
factory = new UAX29URLEmailTokenizerFactory();
factory.init(Collections.singletonMap("luceneMatchVersion", "3.1"));
stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] {"さ"}); // old broken behavior
}
}