PERF: improve `findAllMatches` speed (#22083)

When we introduced unicode support in the regular expressions used in watched words (9a27803) we didn't realize the cost adding the `u` flag would be.

Turns out, it's pretty bad when you have lots of regular expressions to test. A customer had slightly less than 200 watched words, and it would freeze the browser for about 2s on the first check of those regular expressions (roughly 10ms per regular expression).

This commit introduces a new field (`word`) to the serialized watched words which is then converted to a very fast and cheap regular expression on the client-side. We use that regexp to quicly check whether a matcher is even worth trying so that we don't incure the cost of compiling the expensive unicode regexp.

This commit also busts the `WordWatcher` cache since we added a new field to be serialized.

One nice side effect of using `matchAll` instead of a `while / exec` loop is that the likeliness of having a bad regexp matching infinitely is vastly reduced 🙌
This commit is contained in:
Régis Hanol 2023-06-13 18:34:28 +02:00 committed by GitHub
parent 367b3be035
commit 4cb3412a56
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 39 additions and 26 deletions

View File

@ -1683,6 +1683,7 @@ var bar = 'bar';
const opts = { const opts = {
watchedWordsReplace: { watchedWordsReplace: {
"(?:\\W|^)(fun)(?=\\W|$)": { "(?:\\W|^)(fun)(?=\\W|$)": {
word: "fun",
replacement: "times", replacement: "times",
case_sensitive: false, case_sensitive: false,
}, },
@ -1697,6 +1698,7 @@ var bar = 'bar';
const opts = { const opts = {
watchedWordsLink: { watchedWordsLink: {
"(?:\\W|^)(fun)(?=\\W|$)": { "(?:\\W|^)(fun)(?=\\W|$)": {
word: "fun",
replacement: "https://discourse.org", replacement: "https://discourse.org",
case_sensitive: false, case_sensitive: false,
}, },
@ -1711,18 +1713,21 @@ var bar = 'bar';
}); });
test("watched words replace with bad regex", function (assert) { test("watched words replace with bad regex", function (assert) {
const maxMatches = 100; // same limit as MD watched-words-replace plugin
const opts = { const opts = {
siteSettings: { watched_words_regular_expressions: true }, siteSettings: { watched_words_regular_expressions: true },
watchedWordsReplace: { watchedWordsReplace: {
"(\\bu?\\b)": { replacement: "you", case_sensitive: false }, "(\\bu?\\b)": {
word: "(\\bu?\\b)",
replacement: "you",
case_sensitive: false,
},
}, },
}; };
assert.cookedOptions( assert.cookedOptions(
"one", "one",
opts, opts,
`<p>${"you".repeat(maxMatches)}one</p>`, `<p>youoneyou</p>`,
"does not loop infinitely" "does not loop infinitely"
); );
}); });

View File

@ -16,22 +16,26 @@ function isLinkClose(str) {
function findAllMatches(text, matchers) { function findAllMatches(text, matchers) {
const matches = []; const matches = [];
let count = 0; for (const { word, pattern, replacement, link } of matchers) {
if (matches.length >= MAX_MATCHES) {
matchers.forEach((matcher) => { break;
let match;
while (
(match = matcher.pattern.exec(text)) !== null &&
count++ < MAX_MATCHES
) {
matches.push({
index: match.index + match[0].indexOf(match[1]),
text: match[1],
replacement: matcher.replacement,
link: matcher.link,
});
} }
});
if (word.test(text)) {
for (const match of text.matchAll(pattern)) {
matches.push({
index: match.index + match[0].indexOf(match[1]),
text: match[1],
replacement,
link,
});
if (matches.length >= MAX_MATCHES) {
break;
}
}
}
}
return matches.sort((a, b) => a.index - b.index); return matches.sort((a, b) => a.index - b.index);
} }
@ -52,11 +56,12 @@ export function setup(helper) {
const matchers = []; const matchers = [];
if (md.options.discourse.watchedWordsReplace) { if (md.options.discourse.watchedWordsReplace) {
Object.entries(md.options.discourse.watchedWordsReplace).map( Object.entries(md.options.discourse.watchedWordsReplace).forEach(
([regexpString, options]) => { ([regexpString, options]) => {
const word = toWatchedWord({ [regexpString]: options }); const word = toWatchedWord({ [regexpString]: options });
matchers.push({ matchers.push({
word: new RegExp(options.word, options.case_sensitive ? "" : "i"),
pattern: createWatchedWordRegExp(word), pattern: createWatchedWordRegExp(word),
replacement: options.replacement, replacement: options.replacement,
link: false, link: false,
@ -66,11 +71,12 @@ export function setup(helper) {
} }
if (md.options.discourse.watchedWordsLink) { if (md.options.discourse.watchedWordsLink) {
Object.entries(md.options.discourse.watchedWordsLink).map( Object.entries(md.options.discourse.watchedWordsLink).forEach(
([regexpString, options]) => { ([regexpString, options]) => {
const word = toWatchedWord({ [regexpString]: options }); const word = toWatchedWord({ [regexpString]: options });
matchers.push({ matchers.push({
word: new RegExp(options.word, options.case_sensitive ? "" : "i"),
pattern: createWatchedWordRegExp(word), pattern: createWatchedWordRegExp(word),
replacement: options.replacement, replacement: options.replacement,
link: true, link: true,

View File

@ -2,7 +2,7 @@
class WordWatcher class WordWatcher
REPLACEMENT_LETTER ||= CGI.unescape_html("&#9632;") REPLACEMENT_LETTER ||= CGI.unescape_html("&#9632;")
CACHE_VERSION = 2 CACHE_VERSION ||= 3
def initialize(raw) def initialize(raw)
@raw = raw @raw = raw
@ -24,8 +24,9 @@ class WordWatcher
.limit(WatchedWord::MAX_WORDS_PER_ACTION) .limit(WatchedWord::MAX_WORDS_PER_ACTION)
.order(:id) .order(:id)
.pluck(:word, :replacement, :case_sensitive) .pluck(:word, :replacement, :case_sensitive)
.map { |w, r, c| [w, { replacement: r, case_sensitive: c }.compact] } .to_h do |w, r, c|
.to_h [w, { word: word_to_regexp(w, whole: false), replacement: r, case_sensitive: c }.compact]
end
end end
def self.words_for_action_exists?(action) def self.words_for_action_exists?(action)
@ -78,9 +79,7 @@ class WordWatcher
end end
def self.word_matcher_regexps(action, engine: :ruby) def self.word_matcher_regexps(action, engine: :ruby)
if words = get_cached_words(action) get_cached_words(action)&.to_h { |word, attrs| [word_to_regexp(word, engine: engine), attrs] }
words.map { |word, attrs| [word_to_regexp(word, engine: engine), attrs] }.to_h
end
end end
def self.word_to_regexp(word, engine: :ruby, whole: true) def self.word_to_regexp(word, engine: :ruby, whole: true)

View File

@ -21,9 +21,11 @@ RSpec.describe WordWatcher do
expect(described_class.words_for_action(:block)).to include( expect(described_class.words_for_action(:block)).to include(
word1 => { word1 => {
case_sensitive: false, case_sensitive: false,
word: word1,
}, },
word2 => { word2 => {
case_sensitive: true, case_sensitive: true,
word: word2,
}, },
) )
end end
@ -40,6 +42,7 @@ RSpec.describe WordWatcher do
word => { word => {
case_sensitive: false, case_sensitive: false,
replacement: "http://test.localhost/", replacement: "http://test.localhost/",
word: word,
}, },
) )
end end