mirror of
https://github.com/apache/lucene.git
synced 2025-02-07 10:38:40 +00:00
Simplify JFlex grammars by using set difference rather than negation (#515)
The JFlex grammars no longer apply the complement operator twice as a De Morgan workaround for macros not being allowed in character classes. With the latest version of JFlex, we can perform the set subtraction directly and avoid unnecessary NFA->DFA conversions. This speeds up `generateUAX29URLEmailTokenizer` by around 3x.
This commit is contained in:
parent
d36c70cdd6
commit
9000dfc382
@ -1,5 +1,5 @@
|
|||||||
{
|
{
|
||||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "b79e24f254afcd138287049394f22bac9e094e13",
|
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "728f9060d6e8e5f27b1ab6ab5468b1483b2b22cd",
|
||||||
"lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "c65096becfadf92ef811f50fbe783201c6d186ea",
|
"lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "12cdf32aa71cd6dce0d1efe31eacb084188420c7",
|
||||||
"property:tldZones": "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
|
"property:tldZones": "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
|
||||||
}
|
}
|
@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",
|
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",
|
||||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "b79e24f254afcd138287049394f22bac9e094e13",
|
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "728f9060d6e8e5f27b1ab6ab5468b1483b2b22cd",
|
||||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "ca861d63fe9f0d1049bbcf8979814f14c8615dc7",
|
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "d92fe45a9f6e12b7aedd446fabbbc0e93e9da0c3",
|
||||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "1a72f979479840d9a973830798b2f74b41c3fa94"
|
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "00a4d8cccb5e8d05aba986d48ef1acedd88a15cf"
|
||||||
}
|
}
|
@ -15,7 +15,7 @@
|
|||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
// Generated from IANA TLD Database <https://data.iana.org/TLD/tlds-alpha-by-domain.txt>
|
// Generated from IANA TLD Database <https://data.iana.org/TLD/tlds-alpha-by-domain.txt>
|
||||||
// file version from 2021 Nov 17, Wed 07:07:01 Coordinated Universal Time
|
// file version from 2021 Dec 4, Sat 07:07:01 Coordinated Universal Time
|
||||||
// generated by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
|
// generated by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
|
||||||
|
|
||||||
// LUCENE-8278: None of the TLDs in {ASCIITLD} is a 1-character-shorter prefix of another TLD
|
// LUCENE-8278: None of the TLDs in {ASCIITLD} is a 1-character-shorter prefix of another TLD
|
||||||
@ -41,7 +41,6 @@ ASCIITLD = "." (
|
|||||||
| [aA][eE][gG]
|
| [aA][eE][gG]
|
||||||
| [aA][eE][rR][oO]
|
| [aA][eE][rR][oO]
|
||||||
| [aA][eE][tT][nN][aA]
|
| [aA][eE][tT][nN][aA]
|
||||||
| [aA][fF][aA][mM][iI][lL][yY][cC][oO][mM][pP][aA][nN][yY]
|
|
||||||
| [aA][fF][lL]
|
| [aA][fF][lL]
|
||||||
| [aA][fF][rR][iI][cC][aA]
|
| [aA][fF][rR][iI][cC][aA]
|
||||||
| [aA][gG]
|
| [aA][gG]
|
||||||
@ -345,7 +344,6 @@ ASCIITLD = "." (
|
|||||||
| [dD][rR][iI][vV][eE]
|
| [dD][rR][iI][vV][eE]
|
||||||
| [dD][tT][vV]
|
| [dD][tT][vV]
|
||||||
| [dD][uU][bB][aA][iI]
|
| [dD][uU][bB][aA][iI]
|
||||||
| [dD][uU][cC][kK]
|
|
||||||
| [dD][uU][nN][lL][oO][pP]
|
| [dD][uU][nN][lL][oO][pP]
|
||||||
| [dD][uU][pP][oO][nN][tT]
|
| [dD][uU][pP][oO][nN][tT]
|
||||||
| [dD][uU][rR][bB][aA][nN]
|
| [dD][uU][rR][bB][aA][nN]
|
||||||
@ -464,7 +462,6 @@ ASCIITLD = "." (
|
|||||||
| [gG][iI][fF][tT][sS]
|
| [gG][iI][fF][tT][sS]
|
||||||
| [gG][iI][vV][eE][sS]
|
| [gG][iI][vV][eE][sS]
|
||||||
| [gG][iI][vV][iI][nN][gG]
|
| [gG][iI][vV][iI][nN][gG]
|
||||||
| [gG][lL][aA][dD][eE]
|
|
||||||
| [gG][lL][aA][sS][sS]
|
| [gG][lL][aA][sS][sS]
|
||||||
| [gG][lL][eE]
|
| [gG][lL][eE]
|
||||||
| [gG][lL][oO][bB][aA][lL]
|
| [gG][lL][oO][bB][aA][lL]
|
||||||
@ -817,7 +814,6 @@ ASCIITLD = "." (
|
|||||||
| [nN][zZ]
|
| [nN][zZ]
|
||||||
| [oO][bB][iI]
|
| [oO][bB][iI]
|
||||||
| [oO][bB][sS][eE][rR][vV][eE][rR]
|
| [oO][bB][sS][eE][rR][vV][eE][rR]
|
||||||
| [oO][fF][fF]
|
|
||||||
| [oO][fF][fF][iI][cC][eE]
|
| [oO][fF][fF][iI][cC][eE]
|
||||||
| [oO][kK][iI][nN][aA][wW][aA]
|
| [oO][kK][iI][nN][aA][wW][aA]
|
||||||
| [oO][lL][aA][yY][aA][nN]
|
| [oO][lL][aA][yY][aA][nN]
|
||||||
@ -909,7 +905,6 @@ ASCIITLD = "." (
|
|||||||
| [qQ][uU][eE][sS][tT]
|
| [qQ][uU][eE][sS][tT]
|
||||||
| [rR][aA][cC][iI][nN][gG]
|
| [rR][aA][cC][iI][nN][gG]
|
||||||
| [rR][aA][dD][iI][oO]
|
| [rR][aA][dD][iI][oO]
|
||||||
| [rR][aA][iI][dD]
|
|
||||||
| [rR][eE][aA][dD]
|
| [rR][eE][aA][dD]
|
||||||
| [rR][eE][aA][lL][eE][sS][tT][aA][tT][eE]
|
| [rR][eE][aA][lL][eE][sS][tT][aA][tT][eE]
|
||||||
| [rR][eE][aA][lL][tT][oO][rR]
|
| [rR][eE][aA][lL][tT][oO][rR]
|
||||||
@ -977,7 +972,6 @@ ASCIITLD = "." (
|
|||||||
| [sS][cC][hH][uU][lL][eE]
|
| [sS][cC][hH][uU][lL][eE]
|
||||||
| [sS][cC][hH][wW][aA][rR][zZ]
|
| [sS][cC][hH][wW][aA][rR][zZ]
|
||||||
| [sS][cC][iI][eE][nN][cC][eE]
|
| [sS][cC][iI][eE][nN][cC][eE]
|
||||||
| [sS][cC][jJ][oO][hH][nN][sS][oO][nN]
|
|
||||||
| [sS][cC][oO][tT]
|
| [sS][cC][oO][tT]
|
||||||
| [sS][dD]
|
| [sS][dD]
|
||||||
| [sS][eE][aA][rR][cC][hH]
|
| [sS][eE][aA][rR][cC][hH]
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -58,14 +58,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||||||
ExtFmtZwj = [\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]*
|
ExtFmtZwj = [\p{WB:Format}\p{WB:Extend}\p{WB:ZWJ}]*
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
|
||||||
// Begin Emoji Macros - see documentation below, near the EMOJI_TYPE rule
|
|
||||||
|
|
||||||
Emoji = \p{Emoji}
|
|
||||||
Emoji_Modifier = \p{Emoji_Modifier}
|
|
||||||
Emoji_Modifier_Base = \p{Emoji_Modifier_Base}
|
|
||||||
Extended_Pictographic = \p{Extended_Pictographic}
|
|
||||||
|
|
||||||
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
|
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
|
||||||
//
|
//
|
||||||
// \uFE0E (Text Presentation Selector) and \uFE0F (Emoji Presentation Selector) - included in \p{WB:Extend}
|
// \uFE0E (Text Presentation Selector) and \uFE0F (Emoji Presentation Selector) - included in \p{WB:Extend}
|
||||||
@ -80,18 +72,14 @@ KeyCapEx = {KeyCap} {ExtFmtZwjSansPresSel}
|
|||||||
|
|
||||||
// # \u3030 = WAVY DASH; \u303D = PART ALTERNATION MARK
|
// # \u3030 = WAVY DASH; \u303D = PART ALTERNATION MARK
|
||||||
AccidentalEmoji = [©®™\u3030\u303D]
|
AccidentalEmoji = [©®™\u3030\u303D]
|
||||||
EmojiRKAM = ( \p{WB:Regional_Indicator} | {KeyCapBaseChar} | {AccidentalEmoji} | {Emoji_Modifier} )
|
EmojiRKAM = [\p{WB:Regional_Indicator}{KeyCapBaseChar}{AccidentalEmoji}\p{Emoji_Modifier}]
|
||||||
|
EmojiSansRKAM = [\p{Emoji}--{EmojiRKAM}]
|
||||||
|
|
||||||
// Unlike Unicode properties, macros are not allowed in character classes, so we achieve set difference
|
EmojiChar = ( \p{Extended_Pictographic} | {EmojiSansRKAM} )
|
||||||
// by applying DeMorgan: the expression that matches everything of 'a' not matched by 'b' is: !(!a|b)
|
|
||||||
// TODO: Convert this expression to character class difference when JFlex supports the properties directly (in Unicode 11.0+)
|
|
||||||
EmojiSansRKAM = !( ! {Emoji} | {EmojiRKAM} )
|
|
||||||
|
|
||||||
EmojiChar = ( {Extended_Pictographic} | {EmojiSansRKAM} )
|
EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel}
|
||||||
|
EmojiModifierBaseEx = \p{Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
|
||||||
EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel}
|
EmojiModifierEx = \p{Emoji_Modifier} {ExtFmtZwjSansPresSel}
|
||||||
EmojiModifierBaseEx = {Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
|
|
||||||
EmojiModifierEx = {Emoji_Modifier} {ExtFmtZwjSansPresSel}
|
|
||||||
|
|
||||||
EmojiPresentationSelector = \uFE0F
|
EmojiPresentationSelector = \uFE0F
|
||||||
EmojiCharOrPresSeqOrModSeq = ( \p{WB:ZWJ}* {EmojiCharEx} {EmojiPresentationSelector}? ) | ( ( \p{WB:ZWJ}* {EmojiModifierBaseEx} )? {EmojiModifierEx} )
|
EmojiCharOrPresSeqOrModSeq = ( \p{WB:ZWJ}* {EmojiCharEx} {EmojiPresentationSelector}? ) | ( ( \p{WB:ZWJ}* {EmojiModifierBaseEx} )? {EmojiModifierEx} )
|
||||||
|
@ -24,7 +24,6 @@ aeg
|
|||||||
aero
|
aero
|
||||||
aetna
|
aetna
|
||||||
af
|
af
|
||||||
afamilycompany
|
|
||||||
afl
|
afl
|
||||||
africa
|
africa
|
||||||
ag
|
ag
|
||||||
@ -359,7 +358,6 @@ download
|
|||||||
drive
|
drive
|
||||||
dtv
|
dtv
|
||||||
dubai
|
dubai
|
||||||
duck
|
|
||||||
dunlop
|
dunlop
|
||||||
dupont
|
dupont
|
||||||
durban
|
durban
|
||||||
@ -493,7 +491,6 @@ gifts
|
|||||||
gives
|
gives
|
||||||
giving
|
giving
|
||||||
gl
|
gl
|
||||||
glade
|
|
||||||
glass
|
glass
|
||||||
gle
|
gle
|
||||||
global
|
global
|
||||||
@ -877,7 +874,6 @@ nyc
|
|||||||
nz
|
nz
|
||||||
obi
|
obi
|
||||||
observer
|
observer
|
||||||
off
|
|
||||||
office
|
office
|
||||||
okinawa
|
okinawa
|
||||||
olayan
|
olayan
|
||||||
@ -978,7 +974,6 @@ quebec
|
|||||||
quest
|
quest
|
||||||
racing
|
racing
|
||||||
radio
|
radio
|
||||||
raid
|
|
||||||
re
|
re
|
||||||
read
|
read
|
||||||
realestate
|
realestate
|
||||||
@ -1055,7 +1050,6 @@ school
|
|||||||
schule
|
schule
|
||||||
schwarz
|
schwarz
|
||||||
science
|
science
|
||||||
scjohnson
|
|
||||||
scot
|
scot
|
||||||
sd
|
sd
|
||||||
se
|
se
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
{
|
{
|
||||||
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",
|
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",
|
||||||
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java": "de1fac09ea1550250db3a044b757617f8d62cffe",
|
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java": "de1fac09ea1550250db3a044b757617f8d62cffe",
|
||||||
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "10dcd0c6e0d91c06ec507e8a3e125d144060bc51"
|
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "36abbde28c6b47e1f8e90351e839ec8b2da73b99"
|
||||||
}
|
}
|
@ -48,15 +48,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||||||
%char
|
%char
|
||||||
%buffer 255
|
%buffer 255
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
|
||||||
// Begin Emoji Macros - see documentation below, near the EMOJI_TYPE rule
|
|
||||||
|
|
||||||
Emoji = \p{Emoji}
|
|
||||||
Emoji_Modifier = \p{Emoji_Modifier}
|
|
||||||
Emoji_Modifier_Base = \p{Emoji_Modifier_Base}
|
|
||||||
Extended_Pictographic = \p{Extended_Pictographic}
|
|
||||||
|
|
||||||
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
|
// UAX#29 WB4. X (Extend | Format | ZWJ)* --> X
|
||||||
//
|
//
|
||||||
// \uFE0E (Text Presentation Selector) and \uFE0F (Emoji Presentation Selector) - included in \p{WB:Extend}
|
// \uFE0E (Text Presentation Selector) and \uFE0F (Emoji Presentation Selector) - included in \p{WB:Extend}
|
||||||
@ -71,18 +62,14 @@ KeyCapEx = {KeyCap} {ExtFmtZwjSansPresSel}
|
|||||||
|
|
||||||
// # \u3030 = WAVY DASH; \u303D = PART ALTERNATION MARK
|
// # \u3030 = WAVY DASH; \u303D = PART ALTERNATION MARK
|
||||||
AccidentalEmoji = [©®™\u3030\u303D]
|
AccidentalEmoji = [©®™\u3030\u303D]
|
||||||
EmojiRKAM = ( \p{WB:Regional_Indicator} | {KeyCapBaseChar} | {AccidentalEmoji} | {Emoji_Modifier} )
|
EmojiRKAM = [\p{WB:Regional_Indicator}{KeyCapBaseChar}{AccidentalEmoji}\p{Emoji_Modifier}]
|
||||||
|
EmojiSansRKAM = [\p{Emoji}--{EmojiRKAM}]
|
||||||
|
|
||||||
// Unlike Unicode properties, macros are not allowed in character classes, so we achieve set difference
|
EmojiChar = ( \p{Extended_Pictographic} | {EmojiSansRKAM} )
|
||||||
// by applying DeMorgan: the expression that matches everything of 'a' not matched by 'b' is: !(!a|b)
|
|
||||||
// TODO: Convert this expression to character class difference when JFlex supports the properties directly (in Unicode 11.0+)
|
|
||||||
EmojiSansRKAM = !( ! {Emoji} | {EmojiRKAM} )
|
|
||||||
|
|
||||||
EmojiChar = ( {Extended_Pictographic} | {EmojiSansRKAM} )
|
EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel}
|
||||||
|
EmojiModifierBaseEx = \p{Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
|
||||||
EmojiCharEx = {EmojiChar} {ExtFmtZwjSansPresSel}
|
EmojiModifierEx = \p{Emoji_Modifier} {ExtFmtZwjSansPresSel}
|
||||||
EmojiModifierBaseEx = {Emoji_Modifier_Base} {ExtFmtZwjSansPresSel}
|
|
||||||
EmojiModifierEx = {Emoji_Modifier} {ExtFmtZwjSansPresSel}
|
|
||||||
|
|
||||||
EmojiPresentationSelector = \uFE0F
|
EmojiPresentationSelector = \uFE0F
|
||||||
EmojiCharOrPresSeqOrModSeq = ( \p{WB:ZWJ}* {EmojiCharEx} {EmojiPresentationSelector}? ) | ( ( \p{WB:ZWJ}* {EmojiModifierBaseEx} )? {EmojiModifierEx} )
|
EmojiCharOrPresSeqOrModSeq = ( \p{WB:ZWJ}* {EmojiCharEx} {EmojiPresentationSelector}? ) | ( ( \p{WB:ZWJ}* {EmojiModifierBaseEx} )? {EmojiModifierEx} )
|
||||||
|
Loading…
x
Reference in New Issue
Block a user