Simplify JFlex grammars by using difference rather than negation (#515)

JFlex grammars no longer apply the complement operator twice as a De Morgan workaround for the old restriction that macros were not allowed in character classes. With the latest version of JFlex, we can express the character-class difference directly and avoid unnecessary NFA->DFA conversions. This speeds up `generateUAX29URLEmailTokenizer` by around 3x.
This commit is contained in:
Robert Muir 2021-12-06 21:59:13 -05:00 committed by GitHub
parent ec57641ea5
commit 5c746db53e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 28513 additions and 28777 deletions

View File

@ -1,5 +1,5 @@
{
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "b79e24f254afcd138287049394f22bac9e094e13",
"lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "c65096becfadf92ef811f50fbe783201c6d186ea",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "728f9060d6e8e5f27b1ab6ab5468b1483b2b22cd",
"lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "12cdf32aa71cd6dce0d1efe31eacb084188420c7",
"property:tldZones": "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
}

View File

@ -1,6 +1,6 @@
{
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "b79e24f254afcd138287049394f22bac9e094e13",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "ca861d63fe9f0d1049bbcf8979814f14c8615dc7",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "1a72f979479840d9a973830798b2f74b41c3fa94"
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "728f9060d6e8e5f27b1ab6ab5468b1483b2b22cd",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "d92fe45a9f6e12b7aedd446fabbbc0e93e9da0c3",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "00a4d8cccb5e8d05aba986d48ef1acedd88a15cf"
}

View File

@ -15,7 +15,7 @@
* limitations under the License.
*/
// Generated from IANA TLD Database <https://data.iana.org/TLD/tlds-alpha-by-domain.txt>
// file version from 2021 Nov 17, Wed 07:07:01 Coordinated Universal Time
// file version from 2021 Dec 4, Sat 07:07:01 Coordinated Universal Time
// generated by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
// LUCENE-8278: None of the TLDs in {ASCIITLD} is a 1-character-shorter prefix of another TLD
@ -41,7 +41,6 @@ ASCIITLD = "." (
| [aA][eE][gG]
| [aA][eE][rR][oO]
| [aA][eE][tT][nN][aA]
| [aA][fF][aA][mM][iI][lL][yY][cC][oO][mM][pP][aA][nN][yY]
| [aA][fF][lL]
| [aA][fF][rR][iI][cC][aA]
| [aA][gG]
@ -345,7 +344,6 @@ ASCIITLD = "." (
| [dD][rR][iI][vV][eE]
| [dD][tT][vV]
| [dD][uU][bB][aA][iI]
| [dD][uU][cC][kK]
| [dD][uU][nN][lL][oO][pP]
| [dD][uU][pP][oO][nN][tT]
| [dD][uU][rR][bB][aA][nN]
@ -464,7 +462,6 @@ ASCIITLD = "." (
| [gG][iI][fF][tT][sS]
| [gG][iI][vV][eE][sS]
| [gG][iI][vV][iI][nN][gG]
| [gG][lL][aA][dD][eE]
| [gG][lL][aA][sS][sS]
| [gG][lL][eE]
| [gG][lL][oO][bB][aA][lL]
@ -817,7 +814,6 @@ ASCIITLD = "." (
| [nN][zZ]
| [oO][bB][iI]
| [oO][bB][sS][eE][rR][vV][eE][rR]
| [oO][fF][fF]
| [oO][fF][fF][iI][cC][eE]
| [oO][kK][iI][nN][aA][wW][aA]
| [oO][lL][aA][yY][aA][nN]
@ -909,7 +905,6 @@ ASCIITLD = "." (
| [qQ][uU][eE][sS][tT]
| [rR][aA][cC][iI][nN][gG]
| [rR][aA][dD][iI][oO]
| [rR][aA][iI][dD]
| [rR][eE][aA][dD]
| [rR][eE][aA][lL][eE][sS][tT][aA][tT][eE]
| [rR][eE][aA][lL][tT][oO][rR]
@ -977,7 +972,6 @@ ASCIITLD = "." (
| [sS][cC][hH][uU][lL][eE]
| [sS][cC][hH][wW][aA][rR][zZ]
| [sS][cC][iI][eE][nN][cC][eE]
| [sS][cC][jJ][oO][hH][nN][sS][oO][nN]
| [sS][cC][oO][tT]
| [sS][dD]
| [sS][eE][aA][rR][cC][hH]

View File

@ -58,14 +58,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
ExtFmtZwj = [\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]*
//////////////////////////////////////////////////////////////////////////
// Begin Emoji Macros - see documentation below, near the EMOJI_TYPE rule
Emoji = \p{Emoji}
Emoji_Modifier = \p{Emoji_Modifier}
Emoji_Modifier_Base = \p{Emoji_Modifier_Base}
Extended_Pictographic = \p{Extended_Pictographic}
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
//
// \uFE0E (Text Presentation Selector) and \uFE0F (Emoji Presentation Selector) - included in \p{WB:Extend}
@ -80,18 +72,14 @@ KeyCapEx = {KeyCap} {ExtFmtZwjSansPresSel}
// # \u3030 = WAVY DASH; \u303D = PART ALTERNATION MARK
AccidentalEmoji = [©®™\u3030\u303D]
EmojiRKAM = ( \p{WB:Regional_Indicator} | {KeyCapBaseChar} | {AccidentalEmoji} | {Emoji_Modifier} )
EmojiRKAM = [\p{WB:Regional_Indicator}{KeyCapBaseChar}{AccidentalEmoji}\p{Emoji_Modifier}]
EmojiSansRKAM = [\p{Emoji}--{EmojiRKAM}]
// Unlike Unicode properties, macros are not allowed in character classes, so we achieve set difference
// by applying DeMorgan: the expression that matches everything of 'a' not matched by 'b' is: !(!a|b)
// TODO: Convert this expression to character class difference when JFlex supports the properties directly (in Unicode 11.0+)
EmojiSansRKAM = !( ! {Emoji} | {EmojiRKAM} )
EmojiChar = ( \p{Extended_Pictographic} | {EmojiSansRKAM} )
EmojiChar = ( {Extended_Pictographic} | {EmojiSansRKAM} )
EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel}
EmojiModifierBaseEx = {Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
EmojiModifierEx = {Emoji_Modifier} {ExtFmtZwjSansPresSel}
EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel}
EmojiModifierBaseEx = \p{Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
EmojiModifierEx = \p{Emoji_Modifier} {ExtFmtZwjSansPresSel}
EmojiPresentationSelector = \uFE0F
EmojiCharOrPresSeqOrModSeq = ( \p{WB:ZWJ}* {EmojiCharEx} {EmojiPresentationSelector}? ) | ( ( \p{WB:ZWJ}* {EmojiModifierBaseEx} )? {EmojiModifierEx} )

View File

@ -24,7 +24,6 @@ aeg
aero
aetna
af
afamilycompany
afl
africa
ag
@ -359,7 +358,6 @@ download
drive
dtv
dubai
duck
dunlop
dupont
durban
@ -493,7 +491,6 @@ gifts
gives
giving
gl
glade
glass
gle
global
@ -877,7 +874,6 @@ nyc
nz
obi
observer
off
office
okinawa
olayan
@ -978,7 +974,6 @@ quebec
quest
racing
radio
raid
re
read
realestate
@ -1055,7 +1050,6 @@ school
schule
schwarz
science
scjohnson
scot
sd
se

View File

@ -1,5 +1,5 @@
{
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java": "de1fac09ea1550250db3a044b757617f8d62cffe",
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "10dcd0c6e0d91c06ec507e8a3e125d144060bc51"
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "36abbde28c6b47e1f8e90351e839ec8b2da73b99"
}

View File

@ -48,15 +48,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%char
%buffer 255
//////////////////////////////////////////////////////////////////////////
// Begin Emoji Macros - see documentation below, near the EMOJI_TYPE rule
Emoji = \p{Emoji}
Emoji_Modifier = \p{Emoji_Modifier}
Emoji_Modifier_Base = \p{Emoji_Modifier_Base}
Extended_Pictographic = \p{Extended_Pictographic}
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
//
// \uFE0E (Text Presentation Selector) and \uFE0F (Emoji Presentation Selector) - included in \p{WB:Extend}
@ -71,18 +62,14 @@ KeyCapEx = {KeyCap} {ExtFmtZwjSansPresSel}
// # \u3030 = WAVY DASH; \u303D = PART ALTERNATION MARK
AccidentalEmoji = [©®™\u3030\u303D]
EmojiRKAM = ( \p{WB:Regional_Indicator} | {KeyCapBaseChar} | {AccidentalEmoji} | {Emoji_Modifier} )
EmojiRKAM = [\p{WB:Regional_Indicator}{KeyCapBaseChar}{AccidentalEmoji}\p{Emoji_Modifier}]
EmojiSansRKAM = [\p{Emoji}--{EmojiRKAM}]
// Unlike Unicode properties, macros are not allowed in character classes, so we achieve set difference
// by applying DeMorgan: the expression that matches everything of 'a' not matched by 'b' is: !(!a|b)
// TODO: Convert this expression to character class difference when JFlex supports the properties directly (in Unicode 11.0+)
EmojiSansRKAM = !( ! {Emoji} | {EmojiRKAM} )
EmojiChar = ( \p{Extended_Pictographic} | {EmojiSansRKAM} )
EmojiChar = ( {Extended_Pictographic} | {EmojiSansRKAM} )
EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel}
EmojiModifierBaseEx = {Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
EmojiModifierEx = {Emoji_Modifier} {ExtFmtZwjSansPresSel}
EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel}
EmojiModifierBaseEx = \p{Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
EmojiModifierEx = \p{Emoji_Modifier} {ExtFmtZwjSansPresSel}
EmojiPresentationSelector = \uFE0F
EmojiCharOrPresSeqOrModSeq = ( \p{WB:ZWJ}* {EmojiCharEx} {EmojiPresentationSelector}? ) | ( ( \p{WB:ZWJ}* {EmojiModifierBaseEx} )? {EmojiModifierEx} )