Simplify JFlex grammars by using difference rather than negation (#515)

The JFlex grammars no longer apply the complement operator twice as a De Morgan workaround for the "macros are not allowed in character classes" restriction. With the latest version of JFlex, we can express the set difference directly inside a character class and avoid unnecessary NFA->DFA conversions. This speeds up `generateUAX29URLEmailTokenizer` by around 3x.
This commit is contained in:
Robert Muir 2021-12-06 21:59:13 -05:00
parent d36c70cdd6
commit 9000dfc382
No known key found for this signature in database
GPG Key ID: 817AE1DD322D7ECA
8 changed files with 28513 additions and 28777 deletions

View File

@ -1,5 +1,5 @@
{ {
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "b79e24f254afcd138287049394f22bac9e094e13", "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "728f9060d6e8e5f27b1ab6ab5468b1483b2b22cd",
"lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "c65096becfadf92ef811f50fbe783201c6d186ea", "lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "12cdf32aa71cd6dce0d1efe31eacb084188420c7",
"property:tldZones": "https://data.iana.org/TLD/tlds-alpha-by-domain.txt" "property:tldZones": "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
} }

View File

@ -1,6 +1,6 @@
{ {
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e", "gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "b79e24f254afcd138287049394f22bac9e094e13", "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "728f9060d6e8e5f27b1ab6ab5468b1483b2b22cd",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "ca861d63fe9f0d1049bbcf8979814f14c8615dc7", "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "d92fe45a9f6e12b7aedd446fabbbc0e93e9da0c3",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "1a72f979479840d9a973830798b2f74b41c3fa94" "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "00a4d8cccb5e8d05aba986d48ef1acedd88a15cf"
} }

View File

@ -15,7 +15,7 @@
* limitations under the License. * limitations under the License.
*/ */
// Generated from IANA TLD Database <https://data.iana.org/TLD/tlds-alpha-by-domain.txt> // Generated from IANA TLD Database <https://data.iana.org/TLD/tlds-alpha-by-domain.txt>
// file version from 2021 Nov 17, Wed 07:07:01 Coordinated Universal Time // file version from 2021 Dec 4, Sat 07:07:01 Coordinated Universal Time
// generated by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros // generated by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
// LUCENE-8278: None of the TLDs in {ASCIITLD} is a 1-character-shorter prefix of another TLD // LUCENE-8278: None of the TLDs in {ASCIITLD} is a 1-character-shorter prefix of another TLD
@ -41,7 +41,6 @@ ASCIITLD = "." (
| [aA][eE][gG] | [aA][eE][gG]
| [aA][eE][rR][oO] | [aA][eE][rR][oO]
| [aA][eE][tT][nN][aA] | [aA][eE][tT][nN][aA]
| [aA][fF][aA][mM][iI][lL][yY][cC][oO][mM][pP][aA][nN][yY]
| [aA][fF][lL] | [aA][fF][lL]
| [aA][fF][rR][iI][cC][aA] | [aA][fF][rR][iI][cC][aA]
| [aA][gG] | [aA][gG]
@ -345,7 +344,6 @@ ASCIITLD = "." (
| [dD][rR][iI][vV][eE] | [dD][rR][iI][vV][eE]
| [dD][tT][vV] | [dD][tT][vV]
| [dD][uU][bB][aA][iI] | [dD][uU][bB][aA][iI]
| [dD][uU][cC][kK]
| [dD][uU][nN][lL][oO][pP] | [dD][uU][nN][lL][oO][pP]
| [dD][uU][pP][oO][nN][tT] | [dD][uU][pP][oO][nN][tT]
| [dD][uU][rR][bB][aA][nN] | [dD][uU][rR][bB][aA][nN]
@ -464,7 +462,6 @@ ASCIITLD = "." (
| [gG][iI][fF][tT][sS] | [gG][iI][fF][tT][sS]
| [gG][iI][vV][eE][sS] | [gG][iI][vV][eE][sS]
| [gG][iI][vV][iI][nN][gG] | [gG][iI][vV][iI][nN][gG]
| [gG][lL][aA][dD][eE]
| [gG][lL][aA][sS][sS] | [gG][lL][aA][sS][sS]
| [gG][lL][eE] | [gG][lL][eE]
| [gG][lL][oO][bB][aA][lL] | [gG][lL][oO][bB][aA][lL]
@ -817,7 +814,6 @@ ASCIITLD = "." (
| [nN][zZ] | [nN][zZ]
| [oO][bB][iI] | [oO][bB][iI]
| [oO][bB][sS][eE][rR][vV][eE][rR] | [oO][bB][sS][eE][rR][vV][eE][rR]
| [oO][fF][fF]
| [oO][fF][fF][iI][cC][eE] | [oO][fF][fF][iI][cC][eE]
| [oO][kK][iI][nN][aA][wW][aA] | [oO][kK][iI][nN][aA][wW][aA]
| [oO][lL][aA][yY][aA][nN] | [oO][lL][aA][yY][aA][nN]
@ -909,7 +905,6 @@ ASCIITLD = "." (
| [qQ][uU][eE][sS][tT] | [qQ][uU][eE][sS][tT]
| [rR][aA][cC][iI][nN][gG] | [rR][aA][cC][iI][nN][gG]
| [rR][aA][dD][iI][oO] | [rR][aA][dD][iI][oO]
| [rR][aA][iI][dD]
| [rR][eE][aA][dD] | [rR][eE][aA][dD]
| [rR][eE][aA][lL][eE][sS][tT][aA][tT][eE] | [rR][eE][aA][lL][eE][sS][tT][aA][tT][eE]
| [rR][eE][aA][lL][tT][oO][rR] | [rR][eE][aA][lL][tT][oO][rR]
@ -977,7 +972,6 @@ ASCIITLD = "." (
| [sS][cC][hH][uU][lL][eE] | [sS][cC][hH][uU][lL][eE]
| [sS][cC][hH][wW][aA][rR][zZ] | [sS][cC][hH][wW][aA][rR][zZ]
| [sS][cC][iI][eE][nN][cC][eE] | [sS][cC][iI][eE][nN][cC][eE]
| [sS][cC][jJ][oO][hH][nN][sS][oO][nN]
| [sS][cC][oO][tT] | [sS][cC][oO][tT]
| [sS][dD] | [sS][dD]
| [sS][eE][aA][rR][cC][hH] | [sS][eE][aA][rR][cC][hH]

View File

@ -58,14 +58,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
ExtFmtZwj = [\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]* ExtFmtZwj = [\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]*
//////////////////////////////////////////////////////////////////////////
// Begin Emoji Macros - see documentation below, near the EMOJI_TYPE rule
Emoji = \p{Emoji}
Emoji_Modifier = \p{Emoji_Modifier}
Emoji_Modifier_Base = \p{Emoji_Modifier_Base}
Extended_Pictographic = \p{Extended_Pictographic}
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X // UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
// //
// \uFE0E (Text Presentation Selector) and \uFE0F (Emoji Presentation Selector) - included in \p{WB:Extend} // \uFE0E (Text Presentation Selector) and \uFE0F (Emoji Presentation Selector) - included in \p{WB:Extend}
@ -80,18 +72,14 @@ KeyCapEx = {KeyCap} {ExtFmtZwjSansPresSel}
// # \u3030 = WAVY DASH; \u303D = PART ALTERNATION MARK // # \u3030 = WAVY DASH; \u303D = PART ALTERNATION MARK
AccidentalEmoji = [©®™\u3030\u303D] AccidentalEmoji = [©®™\u3030\u303D]
EmojiRKAM = ( \p{WB:Regional_Indicator} | {KeyCapBaseChar} | {AccidentalEmoji} | {Emoji_Modifier} ) EmojiRKAM = [\p{WB:Regional_Indicator}{KeyCapBaseChar}{AccidentalEmoji}\p{Emoji_Modifier}]
EmojiSansRKAM = [\p{Emoji}--{EmojiRKAM}]
// Unlike Unicode properties, macros are not allowed in character classes, so we achieve set difference EmojiChar = ( \p{Extended_Pictographic} | {EmojiSansRKAM} )
// by applying DeMorgan: the expression that matches everything of 'a' not matched by 'b' is: !(!a|b)
// TODO: Convert this expression to character class difference when JFlex supports the properties directly (in Unicode 11.0+)
EmojiSansRKAM = !( ! {Emoji} | {EmojiRKAM} )
EmojiChar = ( {Extended_Pictographic} | {EmojiSansRKAM} ) EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel}
EmojiModifierBaseEx = \p{Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel} EmojiModifierEx = \p{Emoji_Modifier} {ExtFmtZwjSansPresSel}
EmojiModifierBaseEx = {Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
EmojiModifierEx = {Emoji_Modifier} {ExtFmtZwjSansPresSel}
EmojiPresentationSelector = \uFE0F EmojiPresentationSelector = \uFE0F
EmojiCharOrPresSeqOrModSeq = ( \p{WB:ZWJ}* {EmojiCharEx} {EmojiPresentationSelector}? ) | ( ( \p{WB:ZWJ}* {EmojiModifierBaseEx} )? {EmojiModifierEx} ) EmojiCharOrPresSeqOrModSeq = ( \p{WB:ZWJ}* {EmojiCharEx} {EmojiPresentationSelector}? ) | ( ( \p{WB:ZWJ}* {EmojiModifierBaseEx} )? {EmojiModifierEx} )

View File

@ -24,7 +24,6 @@ aeg
aero aero
aetna aetna
af af
afamilycompany
afl afl
africa africa
ag ag
@ -359,7 +358,6 @@ download
drive drive
dtv dtv
dubai dubai
duck
dunlop dunlop
dupont dupont
durban durban
@ -493,7 +491,6 @@ gifts
gives gives
giving giving
gl gl
glade
glass glass
gle gle
global global
@ -877,7 +874,6 @@ nyc
nz nz
obi obi
observer observer
off
office office
okinawa okinawa
olayan olayan
@ -978,7 +974,6 @@ quebec
quest quest
racing racing
radio radio
raid
re re
read read
realestate realestate
@ -1055,7 +1050,6 @@ school
schule schule
schwarz schwarz
science science
scjohnson
scot scot
sd sd
se se

View File

@ -1,5 +1,5 @@
{ {
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e", "gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java": "de1fac09ea1550250db3a044b757617f8d62cffe", "lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java": "de1fac09ea1550250db3a044b757617f8d62cffe",
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "10dcd0c6e0d91c06ec507e8a3e125d144060bc51" "lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "36abbde28c6b47e1f8e90351e839ec8b2da73b99"
} }

View File

@ -48,15 +48,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%char %char
%buffer 255 %buffer 255
//////////////////////////////////////////////////////////////////////////
// Begin Emoji Macros - see documentation below, near the EMOJI_TYPE rule
Emoji = \p{Emoji}
Emoji_Modifier = \p{Emoji_Modifier}
Emoji_Modifier_Base = \p{Emoji_Modifier_Base}
Extended_Pictographic = \p{Extended_Pictographic}
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X // UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
// //
// \uFE0E (Text Presentation Selector) and \uFE0F (Emoji Presentation Selector) - included in \p{WB:Extend} // \uFE0E (Text Presentation Selector) and \uFE0F (Emoji Presentation Selector) - included in \p{WB:Extend}
@ -71,18 +62,14 @@ KeyCapEx = {KeyCap} {ExtFmtZwjSansPresSel}
// # \u3030 = WAVY DASH; \u303D = PART ALTERNATION MARK // # \u3030 = WAVY DASH; \u303D = PART ALTERNATION MARK
AccidentalEmoji = [©®™\u3030\u303D] AccidentalEmoji = [©®™\u3030\u303D]
EmojiRKAM = ( \p{WB:Regional_Indicator} | {KeyCapBaseChar} | {AccidentalEmoji} | {Emoji_Modifier} ) EmojiRKAM = [\p{WB:Regional_Indicator}{KeyCapBaseChar}{AccidentalEmoji}\p{Emoji_Modifier}]
EmojiSansRKAM = [\p{Emoji}--{EmojiRKAM}]
// Unlike Unicode properties, macros are not allowed in character classes, so we achieve set difference EmojiChar = ( \p{Extended_Pictographic} | {EmojiSansRKAM} )
// by applying DeMorgan: the expression that matches everything of 'a' not matched by 'b' is: !(!a|b)
// TODO: Convert this expression to character class difference when JFlex supports the properties directly (in Unicode 11.0+)
EmojiSansRKAM = !( ! {Emoji} | {EmojiRKAM} )
EmojiChar = ( {Extended_Pictographic} | {EmojiSansRKAM} ) EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel}
EmojiModifierBaseEx = \p{Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel} EmojiModifierEx = \p{Emoji_Modifier} {ExtFmtZwjSansPresSel}
EmojiModifierBaseEx = {Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
EmojiModifierEx = {Emoji_Modifier} {ExtFmtZwjSansPresSel}
EmojiPresentationSelector = \uFE0F EmojiPresentationSelector = \uFE0F
EmojiCharOrPresSeqOrModSeq = ( \p{WB:ZWJ}* {EmojiCharEx} {EmojiPresentationSelector}? ) | ( ( \p{WB:ZWJ}* {EmojiModifierBaseEx} )? {EmojiModifierEx} ) EmojiCharOrPresSeqOrModSeq = ( \p{WB:ZWJ}* {EmojiCharEx} {EmojiPresentationSelector}? ) | ( ( \p{WB:ZWJ}* {EmojiModifierBaseEx} )? {EmojiModifierEx} )