FIX: Handle all UTF-8 characters (#21344)

Watched words were converted to regular expressions containing \W, which
handled only ASCII characters. Using [^[:word:]] instead ensures that
UTF-8 characters are also handled correctly.
This commit is contained in:
Bianca Nenciu 2023-05-15 11:45:04 +02:00 committed by GitHub
parent 23a146a7c6
commit 9a2780397f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 49 additions and 33 deletions

View File

@ -1,6 +1,6 @@
export function createWatchedWordRegExp(word) {
const caseFlag = word.case_sensitive ? "" : "i";
return new RegExp(word.regexp, `${caseFlag}g`);
return new RegExp(word.regexp, `${caseFlag}gu`);
}
export function toWatchedWord(regexp) {

View File

@ -205,7 +205,7 @@ class SiteSerializer < ApplicationSerializer
end
def censored_regexp
WordWatcher.serializable_word_matcher_regexp(:censor)
WordWatcher.serializable_word_matcher_regexp(:censor, engine: :js)
end
def custom_emoji_translation
@ -221,11 +221,11 @@ class SiteSerializer < ApplicationSerializer
end
def watched_words_replace
WordWatcher.word_matcher_regexps(:replace)
WordWatcher.word_matcher_regexps(:replace, engine: :js)
end
def watched_words_link
WordWatcher.word_matcher_regexps(:link)
WordWatcher.word_matcher_regexps(:link, engine: :js)
end
def categories

View File

@ -4,7 +4,7 @@ class WatchedWordSerializer < ApplicationSerializer
attributes :id, :word, :regexp, :replacement, :action, :case_sensitive
def regexp
WordWatcher.word_to_regexp(word, whole: true)
WordWatcher.word_to_regexp(word)
end
def action

View File

@ -44,23 +44,23 @@ class WordWatcher
end
end
def self.serializable_word_matcher_regexp(action)
word_matcher_regexp_list(action).map { |r| { r.source => { case_sensitive: !r.casefold? } } }
def self.serializable_word_matcher_regexp(action, engine: :ruby)
word_matcher_regexp_list(action, engine: engine).map do |r|
{ r.source => { case_sensitive: !r.casefold? } }
end
end
# This regexp is run in miniracer, and the client JS app
# Make sure it is compatible with major browsers when changing
# hint: non-chrome browsers do not support 'lookbehind'
def self.word_matcher_regexp_list(action, raise_errors: false)
def self.word_matcher_regexp_list(action, engine: :ruby, raise_errors: false)
words = get_cached_words(action)
return [] if words.blank?
grouped_words = { case_sensitive: [], case_insensitive: [] }
words.each do |w, attrs|
word = word_to_regexp(w)
word = "(#{word})" if SiteSetting.watched_words_regular_expressions?
words.each do |word, attrs|
word = word_to_regexp(word, whole: SiteSetting.watched_words_regular_expressions?)
group_key = attrs[:case_sensitive] ? :case_sensitive : :case_insensitive
grouped_words[group_key] << word
end
@ -68,10 +68,7 @@ class WordWatcher
regexps = grouped_words.select { |_, w| w.present? }.transform_values { |w| w.join("|") }
if !SiteSetting.watched_words_regular_expressions?
regexps.transform_values! do |regexp|
regexp = "(#{regexp})"
"(?:\\W|^)#{regexp}(?=\\W|$)"
end
regexps.transform_values! { |regexp| wrap_regexp(regexp, engine: engine) }
end
regexps.map { |c, regexp| Regexp.new(regexp, c == :case_sensitive ? nil : Regexp::IGNORECASE) }
@ -80,29 +77,42 @@ class WordWatcher
[] # Admin will be alerted via admin_dashboard_data.rb
end
def self.word_matcher_regexps(action)
def self.word_matcher_regexps(action, engine: :ruby)
if words = get_cached_words(action)
words.map { |w, opts| [word_to_regexp(w, whole: true), opts] }.to_h
words.map { |word, attrs| [word_to_regexp(word, engine: engine), attrs] }.to_h
end
end
def self.word_to_regexp(word, whole: false)
def self.word_to_regexp(word, engine: :ruby, whole: true)
if SiteSetting.watched_words_regular_expressions?
# Strip ruby regexp format if present
# Strip Ruby regexp format if present
regexp = word.start_with?("(?-mix:") ? word[7..-2] : word
regexp = "(#{regexp})" if whole
return regexp
end
regexp = Regexp.escape(word).gsub("\\*", '\S*')
# Escape regular expression. Avoid using Regexp.escape because it escapes
# more characters than it should (for example, whitespaces)
regexp = word.gsub(/([.*+?^${}()|\[\]\\])/, '\\\\\1')
if whole && !SiteSetting.watched_words_regular_expressions?
regexp = "(?:\\W|^)(#{regexp})(?=\\W|$)"
end
# Handle wildcards
regexp = regexp.gsub("\\*", '\S*')
regexp = wrap_regexp(regexp, engine: engine) if whole
regexp
end
# Wraps +regexp+ so that it only matches as a whole word.
#
# The returned pattern source has three parts: a non-capturing group that
# consumes either one boundary character or the start of a line, a capture
# group holding +regexp+ itself, and a lookahead requiring a boundary
# character or the end of a line.
#
# What counts as a boundary character depends on the engine that will
# evaluate the pattern:
#
# * +:js+   - <tt>\P{L}</tt>, a Unicode property escape (the JavaScript
#   RegExp must be created with the +u+ flag for it to be valid)
# * +:ruby+ - <tt>[^[:word:]]</tt>, the negated POSIX word class
# * anything else - plain <tt>\W</tt>, which only knows ASCII word characters
#
# @param regexp [String] pattern source to wrap
# @param engine [Symbol] target regexp engine, +:ruby+ by default
# @return [String] the wrapped pattern source
def self.wrap_regexp(regexp, engine: :ruby)
  case engine
  when :js
    "(?:\\P{L}|^)(#{regexp})(?=\\P{L}|$)"
  when :ruby
    "(?:[^[:word:]]|^)(#{regexp})(?=[^[:word:]]|$)"
  else
    "(?:\\W|^)(#{regexp})(?=\\W|$)"
  end
end
# Returns the string key for the watched-word list of +action+.
# The key embeds CACHE_VERSION, so keys built with different versions never
# collide (presumably this is used as a cache key and bumping the version
# invalidates old entries - confirm against the callers).
def self.word_matcher_regexp_key(action)
"watched-words-list:v#{CACHE_VERSION}:#{action}"
end
@ -212,10 +222,8 @@ class WordWatcher
end
def word_matches?(word, case_sensitive: false)
Regexp.new(
WordWatcher.word_to_regexp(word, whole: true),
case_sensitive ? nil : Regexp::IGNORECASE,
).match?(@raw)
options = case_sensitive ? nil : Regexp::IGNORECASE
Regexp.new(WordWatcher.word_to_regexp(word), options).match?(@raw)
end
def self.replace_text_with_regexp(text, regexp, replacement)

View File

@ -204,9 +204,9 @@ module PrettyText
__optInput.emojiUnicodeReplacer = __emojiUnicodeReplacer;
__optInput.emojiDenyList = #{Emoji.denied.to_json};
__optInput.lookupUploadUrls = __lookupUploadUrls;
__optInput.censoredRegexp = #{WordWatcher.serializable_word_matcher_regexp(:censor).to_json};
__optInput.watchedWordsReplace = #{WordWatcher.word_matcher_regexps(:replace).to_json};
__optInput.watchedWordsLink = #{WordWatcher.word_matcher_regexps(:link).to_json};
__optInput.censoredRegexp = #{WordWatcher.serializable_word_matcher_regexp(:censor, engine: :js).to_json};
__optInput.watchedWordsReplace = #{WordWatcher.word_matcher_regexps(:replace, engine: :js).to_json};
__optInput.watchedWordsLink = #{WordWatcher.word_matcher_regexps(:link, engine: :js).to_json};
__optInput.additionalOptions = #{Site.markdown_additional_options.to_json};
JS

View File

@ -64,6 +64,14 @@ RSpec.describe WatchedWord do
should_block_post(manager)
end
it "should handle UTF-8 characters" do
  # The blocked word "abc" is immediately followed by the non-ASCII letter
  # "ó" in the raw text, so it is only a fragment of a longer word and the
  # post must not be blocked.
  Fabricate(:watched_word, action: WatchedWord.actions[:block], word: "abc")
  manager =
    NewPostManager.new(tl2_user, title: "Hello world", raw: "abcódef", topic_id: topic.id)
  expect(manager.perform).to be_success
end
it "should block the post from admin" do
manager =
NewPostManager.new(

View File

@ -79,8 +79,8 @@ RSpec.describe WordWatcher do
expect(regexps).to be_an(Array)
expect(regexps.map(&:inspect)).to contain_exactly(
"/(?:\\W|^)(#{word1}|#{word2})(?=\\W|$)/i",
"/(?:\\W|^)(#{word3}|#{word4})(?=\\W|$)/",
"/(?:[^[:word:]]|^)(#{word1}|#{word2})(?=[^[:word:]]|$)/i",
"/(?:[^[:word:]]|^)(#{word3}|#{word4})(?=[^[:word:]]|$)/",
)
end