PERF: improve `findAllMatches` speed (#22083)

When we introduced unicode support in the regular expressions used in watched words (9a27803) we didn't realize the cost adding the `u` flag would be.

Turns out, it's pretty bad when you have lots of regular expressions to test. A customer had slightly less than 200 watched words, and it would freeze the browser for about 2s on the first check of those regular expressions (roughly 10ms per regular expression).

This commit introduces a new field (`word`) to the serialized watched words which is then converted to a very fast and cheap regular expression on the client-side. We use that regexp to quicly check whether a matcher is even worth trying so that we don't incure the cost of compiling the expensive unicode regexp.

This commit also busts the `WordWatcher` cache since we added a new field to be serialized.

One nice side effect of using `matchAll` instead of a `while / exec` loop is that the likeliness of having a bad regexp matching infinitely is vastly reduced 🙌
This commit is contained in:
Régis Hanol 2023-06-13 18:34:28 +02:00 committed by GitHub
parent 367b3be035
commit 4cb3412a56
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 39 additions and 26 deletions

View File

@ -1683,6 +1683,7 @@ var bar = 'bar';
const opts = {
watchedWordsReplace: {
"(?:\\W|^)(fun)(?=\\W|$)": {
word: "fun",
replacement: "times",
case_sensitive: false,
},
@ -1697,6 +1698,7 @@ var bar = 'bar';
const opts = {
watchedWordsLink: {
"(?:\\W|^)(fun)(?=\\W|$)": {
word: "fun",
replacement: "https://discourse.org",
case_sensitive: false,
},
@ -1711,18 +1713,21 @@ var bar = 'bar';
});
test("watched words replace with bad regex", function (assert) {
const maxMatches = 100; // same limit as MD watched-words-replace plugin
const opts = {
siteSettings: { watched_words_regular_expressions: true },
watchedWordsReplace: {
"(\\bu?\\b)": { replacement: "you", case_sensitive: false },
"(\\bu?\\b)": {
word: "(\\bu?\\b)",
replacement: "you",
case_sensitive: false,
},
},
};
assert.cookedOptions(
"one",
opts,
`<p>${"you".repeat(maxMatches)}one</p>`,
`<p>youoneyou</p>`,
"does not loop infinitely"
);
});

View File

@ -16,22 +16,26 @@ function isLinkClose(str) {
function findAllMatches(text, matchers) {
const matches = [];
let count = 0;
matchers.forEach((matcher) => {
let match;
while (
(match = matcher.pattern.exec(text)) !== null &&
count++ < MAX_MATCHES
) {
matches.push({
index: match.index + match[0].indexOf(match[1]),
text: match[1],
replacement: matcher.replacement,
link: matcher.link,
});
for (const { word, pattern, replacement, link } of matchers) {
if (matches.length >= MAX_MATCHES) {
break;
}
});
if (word.test(text)) {
for (const match of text.matchAll(pattern)) {
matches.push({
index: match.index + match[0].indexOf(match[1]),
text: match[1],
replacement,
link,
});
if (matches.length >= MAX_MATCHES) {
break;
}
}
}
}
return matches.sort((a, b) => a.index - b.index);
}
@ -52,11 +56,12 @@ export function setup(helper) {
const matchers = [];
if (md.options.discourse.watchedWordsReplace) {
Object.entries(md.options.discourse.watchedWordsReplace).map(
Object.entries(md.options.discourse.watchedWordsReplace).forEach(
([regexpString, options]) => {
const word = toWatchedWord({ [regexpString]: options });
matchers.push({
word: new RegExp(options.word, options.case_sensitive ? "" : "i"),
pattern: createWatchedWordRegExp(word),
replacement: options.replacement,
link: false,
@ -66,11 +71,12 @@ export function setup(helper) {
}
if (md.options.discourse.watchedWordsLink) {
Object.entries(md.options.discourse.watchedWordsLink).map(
Object.entries(md.options.discourse.watchedWordsLink).forEach(
([regexpString, options]) => {
const word = toWatchedWord({ [regexpString]: options });
matchers.push({
word: new RegExp(options.word, options.case_sensitive ? "" : "i"),
pattern: createWatchedWordRegExp(word),
replacement: options.replacement,
link: true,

View File

@ -2,7 +2,7 @@
class WordWatcher
REPLACEMENT_LETTER ||= CGI.unescape_html("&#9632;")
CACHE_VERSION = 2
CACHE_VERSION ||= 3
def initialize(raw)
@raw = raw
@ -24,8 +24,9 @@ class WordWatcher
.limit(WatchedWord::MAX_WORDS_PER_ACTION)
.order(:id)
.pluck(:word, :replacement, :case_sensitive)
.map { |w, r, c| [w, { replacement: r, case_sensitive: c }.compact] }
.to_h
.to_h do |w, r, c|
[w, { word: word_to_regexp(w, whole: false), replacement: r, case_sensitive: c }.compact]
end
end
def self.words_for_action_exists?(action)
@ -78,9 +79,7 @@ class WordWatcher
end
def self.word_matcher_regexps(action, engine: :ruby)
if words = get_cached_words(action)
words.map { |word, attrs| [word_to_regexp(word, engine: engine), attrs] }.to_h
end
get_cached_words(action)&.to_h { |word, attrs| [word_to_regexp(word, engine: engine), attrs] }
end
def self.word_to_regexp(word, engine: :ruby, whole: true)

View File

@ -21,9 +21,11 @@ RSpec.describe WordWatcher do
expect(described_class.words_for_action(:block)).to include(
word1 => {
case_sensitive: false,
word: word1,
},
word2 => {
case_sensitive: true,
word: word2,
},
)
end
@ -40,6 +42,7 @@ RSpec.describe WordWatcher do
word => {
case_sensitive: false,
replacement: "http://test.localhost/",
word: word,
},
)
end