mirror of https://github.com/apache/lucene.git
Simplify JFlex grammars by using difference rather than negation (#515)
The JFlex grammars now avoid using the complement operator twice as a De Morgan workaround for "macros in char classes". With the latest version of JFlex, we can express the set difference directly and avoid unnecessary NFA->DFA conversions. This speeds up `generateUAX29URLEmailTokenizer` by around 3x.
This commit is contained in:
parent
ec57641ea5
commit
5c746db53e
|
@ -1,5 +1,5 @@
|
|||
{
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "b79e24f254afcd138287049394f22bac9e094e13",
|
||||
"lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "c65096becfadf92ef811f50fbe783201c6d186ea",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "728f9060d6e8e5f27b1ab6ab5468b1483b2b22cd",
|
||||
"lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "12cdf32aa71cd6dce0d1efe31eacb084188420c7",
|
||||
"property:tldZones": "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "b79e24f254afcd138287049394f22bac9e094e13",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "ca861d63fe9f0d1049bbcf8979814f14c8615dc7",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "1a72f979479840d9a973830798b2f74b41c3fa94"
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "728f9060d6e8e5f27b1ab6ab5468b1483b2b22cd",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "d92fe45a9f6e12b7aedd446fabbbc0e93e9da0c3",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "00a4d8cccb5e8d05aba986d48ef1acedd88a15cf"
|
||||
}
|
|
@ -15,7 +15,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
// Generated from IANA TLD Database <https://data.iana.org/TLD/tlds-alpha-by-domain.txt>
|
||||
// file version from 2021 Nov 17, Wed 07:07:01 Coordinated Universal Time
|
||||
// file version from 2021 Dec 4, Sat 07:07:01 Coordinated Universal Time
|
||||
// generated by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
|
||||
|
||||
// LUCENE-8278: None of the TLDs in {ASCIITLD} is a 1-character-shorter prefix of another TLD
|
||||
|
@ -41,7 +41,6 @@ ASCIITLD = "." (
|
|||
| [aA][eE][gG]
|
||||
| [aA][eE][rR][oO]
|
||||
| [aA][eE][tT][nN][aA]
|
||||
| [aA][fF][aA][mM][iI][lL][yY][cC][oO][mM][pP][aA][nN][yY]
|
||||
| [aA][fF][lL]
|
||||
| [aA][fF][rR][iI][cC][aA]
|
||||
| [aA][gG]
|
||||
|
@ -345,7 +344,6 @@ ASCIITLD = "." (
|
|||
| [dD][rR][iI][vV][eE]
|
||||
| [dD][tT][vV]
|
||||
| [dD][uU][bB][aA][iI]
|
||||
| [dD][uU][cC][kK]
|
||||
| [dD][uU][nN][lL][oO][pP]
|
||||
| [dD][uU][pP][oO][nN][tT]
|
||||
| [dD][uU][rR][bB][aA][nN]
|
||||
|
@ -464,7 +462,6 @@ ASCIITLD = "." (
|
|||
| [gG][iI][fF][tT][sS]
|
||||
| [gG][iI][vV][eE][sS]
|
||||
| [gG][iI][vV][iI][nN][gG]
|
||||
| [gG][lL][aA][dD][eE]
|
||||
| [gG][lL][aA][sS][sS]
|
||||
| [gG][lL][eE]
|
||||
| [gG][lL][oO][bB][aA][lL]
|
||||
|
@ -817,7 +814,6 @@ ASCIITLD = "." (
|
|||
| [nN][zZ]
|
||||
| [oO][bB][iI]
|
||||
| [oO][bB][sS][eE][rR][vV][eE][rR]
|
||||
| [oO][fF][fF]
|
||||
| [oO][fF][fF][iI][cC][eE]
|
||||
| [oO][kK][iI][nN][aA][wW][aA]
|
||||
| [oO][lL][aA][yY][aA][nN]
|
||||
|
@ -909,7 +905,6 @@ ASCIITLD = "." (
|
|||
| [qQ][uU][eE][sS][tT]
|
||||
| [rR][aA][cC][iI][nN][gG]
|
||||
| [rR][aA][dD][iI][oO]
|
||||
| [rR][aA][iI][dD]
|
||||
| [rR][eE][aA][dD]
|
||||
| [rR][eE][aA][lL][eE][sS][tT][aA][tT][eE]
|
||||
| [rR][eE][aA][lL][tT][oO][rR]
|
||||
|
@ -977,7 +972,6 @@ ASCIITLD = "." (
|
|||
| [sS][cC][hH][uU][lL][eE]
|
||||
| [sS][cC][hH][wW][aA][rR][zZ]
|
||||
| [sS][cC][iI][eE][nN][cC][eE]
|
||||
| [sS][cC][jJ][oO][hH][nN][sS][oO][nN]
|
||||
| [sS][cC][oO][tT]
|
||||
| [sS][dD]
|
||||
| [sS][eE][aA][rR][cC][hH]
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -58,14 +58,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
ExtFmtZwj = [\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]*
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// Begin Emoji Macros - see documentation below, near the EMOJI_TYPE rule
|
||||
|
||||
Emoji = \p{Emoji}
|
||||
Emoji_Modifier = \p{Emoji_Modifier}
|
||||
Emoji_Modifier_Base = \p{Emoji_Modifier_Base}
|
||||
Extended_Pictographic = \p{Extended_Pictographic}
|
||||
|
||||
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
|
||||
//
|
||||
// \uFE0E (Text Presentation Selector) and \uFE0F (Emoji Presentation Selector) - included in \p{WB:Extend}
|
||||
|
@ -80,18 +72,14 @@ KeyCapEx = {KeyCap} {ExtFmtZwjSansPresSel}
|
|||
|
||||
// # \u3030 = WAVY DASH; \u303D = PART ALTERNATION MARK
|
||||
AccidentalEmoji = [©®™\u3030\u303D]
|
||||
EmojiRKAM = ( \p{WB:Regional_Indicator} | {KeyCapBaseChar} | {AccidentalEmoji} | {Emoji_Modifier} )
|
||||
EmojiRKAM = [\p{WB:Regional_Indicator}{KeyCapBaseChar}{AccidentalEmoji}\p{Emoji_Modifier}]
|
||||
EmojiSansRKAM = [\p{Emoji}--{EmojiRKAM}]
|
||||
|
||||
// Unlike Unicode properties, macros are not allowed in character classes, so we achieve set difference
|
||||
// by applying DeMorgan: the expression that matches everything of 'a' not matched by 'b' is: !(!a|b)
|
||||
// TODO: Convert this expression to character class difference when JFlex supports the properties directly (in Unicode 11.0+)
|
||||
EmojiSansRKAM = !( ! {Emoji} | {EmojiRKAM} )
|
||||
EmojiChar = ( \p{Extended_Pictographic} | {EmojiSansRKAM} )
|
||||
|
||||
EmojiChar = ( {Extended_Pictographic} | {EmojiSansRKAM} )
|
||||
|
||||
EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel}
|
||||
EmojiModifierBaseEx = {Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
|
||||
EmojiModifierEx = {Emoji_Modifier} {ExtFmtZwjSansPresSel}
|
||||
EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel}
|
||||
EmojiModifierBaseEx = \p{Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
|
||||
EmojiModifierEx = \p{Emoji_Modifier} {ExtFmtZwjSansPresSel}
|
||||
|
||||
EmojiPresentationSelector = \uFE0F
|
||||
EmojiCharOrPresSeqOrModSeq = ( \p{WB:ZWJ}* {EmojiCharEx} {EmojiPresentationSelector}? ) | ( ( \p{WB:ZWJ}* {EmojiModifierBaseEx} )? {EmojiModifierEx} )
|
||||
|
|
|
@ -24,7 +24,6 @@ aeg
|
|||
aero
|
||||
aetna
|
||||
af
|
||||
afamilycompany
|
||||
afl
|
||||
africa
|
||||
ag
|
||||
|
@ -359,7 +358,6 @@ download
|
|||
drive
|
||||
dtv
|
||||
dubai
|
||||
duck
|
||||
dunlop
|
||||
dupont
|
||||
durban
|
||||
|
@ -493,7 +491,6 @@ gifts
|
|||
gives
|
||||
giving
|
||||
gl
|
||||
glade
|
||||
glass
|
||||
gle
|
||||
global
|
||||
|
@ -877,7 +874,6 @@ nyc
|
|||
nz
|
||||
obi
|
||||
observer
|
||||
off
|
||||
office
|
||||
okinawa
|
||||
olayan
|
||||
|
@ -978,7 +974,6 @@ quebec
|
|||
quest
|
||||
racing
|
||||
radio
|
||||
raid
|
||||
re
|
||||
read
|
||||
realestate
|
||||
|
@ -1055,7 +1050,6 @@ school
|
|||
schule
|
||||
schwarz
|
||||
science
|
||||
scjohnson
|
||||
scot
|
||||
sd
|
||||
se
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
{
|
||||
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",
|
||||
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java": "de1fac09ea1550250db3a044b757617f8d62cffe",
|
||||
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "10dcd0c6e0d91c06ec507e8a3e125d144060bc51"
|
||||
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "36abbde28c6b47e1f8e90351e839ec8b2da73b99"
|
||||
}
|
|
@ -48,15 +48,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
%char
|
||||
%buffer 255
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// Begin Emoji Macros - see documentation below, near the EMOJI_TYPE rule
|
||||
|
||||
Emoji = \p{Emoji}
|
||||
Emoji_Modifier = \p{Emoji_Modifier}
|
||||
Emoji_Modifier_Base = \p{Emoji_Modifier_Base}
|
||||
Extended_Pictographic = \p{Extended_Pictographic}
|
||||
|
||||
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
|
||||
//
|
||||
// \uFE0E (Text Presentation Selector) and \uFE0F (Emoji Presentation Selector) - included in \p{WB:Extend}
|
||||
|
@ -71,18 +62,14 @@ KeyCapEx = {KeyCap} {ExtFmtZwjSansPresSel}
|
|||
|
||||
// # \u3030 = WAVY DASH; \u303D = PART ALTERNATION MARK
|
||||
AccidentalEmoji = [©®™\u3030\u303D]
|
||||
EmojiRKAM = ( \p{WB:Regional_Indicator} | {KeyCapBaseChar} | {AccidentalEmoji} | {Emoji_Modifier} )
|
||||
EmojiRKAM = [\p{WB:Regional_Indicator}{KeyCapBaseChar}{AccidentalEmoji}\p{Emoji_Modifier}]
|
||||
EmojiSansRKAM = [\p{Emoji}--{EmojiRKAM}]
|
||||
|
||||
// Unlike Unicode properties, macros are not allowed in character classes, so we achieve set difference
|
||||
// by applying DeMorgan: the expression that matches everything of 'a' not matched by 'b' is: !(!a|b)
|
||||
// TODO: Convert this expression to character class difference when JFlex supports the properties directly (in Unicode 11.0+)
|
||||
EmojiSansRKAM = !( ! {Emoji} | {EmojiRKAM} )
|
||||
EmojiChar = ( \p{Extended_Pictographic} | {EmojiSansRKAM} )
|
||||
|
||||
EmojiChar = ( {Extended_Pictographic} | {EmojiSansRKAM} )
|
||||
|
||||
EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel}
|
||||
EmojiModifierBaseEx = {Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
|
||||
EmojiModifierEx = {Emoji_Modifier} {ExtFmtZwjSansPresSel}
|
||||
EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel}
|
||||
EmojiModifierBaseEx = \p{Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
|
||||
EmojiModifierEx = \p{Emoji_Modifier} {ExtFmtZwjSansPresSel}
|
||||
|
||||
EmojiPresentationSelector = \uFE0F
|
||||
EmojiCharOrPresSeqOrModSeq = ( \p{WB:ZWJ}* {EmojiCharEx} {EmojiPresentationSelector}? ) | ( ( \p{WB:ZWJ}* {EmojiModifierBaseEx} )? {EmojiModifierEx} )
|
||||
|
|
Loading…
Reference in New Issue