FIX: Handle all UTF-8 characters (#21344)
Watched words were converted to regular expressions containing \W, which handled only ASCII characters. Using [^[:word]] instead ensures that UTF-8 characters are also handled correctly.
This commit is contained in:
parent
23a146a7c6
commit
9a2780397f
|
@ -1,6 +1,6 @@
|
|||
export function createWatchedWordRegExp(word) {
|
||||
const caseFlag = word.case_sensitive ? "" : "i";
|
||||
return new RegExp(word.regexp, `${caseFlag}g`);
|
||||
return new RegExp(word.regexp, `${caseFlag}gu`);
|
||||
}
|
||||
|
||||
export function toWatchedWord(regexp) {
|
||||
|
|
|
@ -205,7 +205,7 @@ class SiteSerializer < ApplicationSerializer
|
|||
end
|
||||
|
||||
def censored_regexp
|
||||
WordWatcher.serializable_word_matcher_regexp(:censor)
|
||||
WordWatcher.serializable_word_matcher_regexp(:censor, engine: :js)
|
||||
end
|
||||
|
||||
def custom_emoji_translation
|
||||
|
@ -221,11 +221,11 @@ class SiteSerializer < ApplicationSerializer
|
|||
end
|
||||
|
||||
def watched_words_replace
|
||||
WordWatcher.word_matcher_regexps(:replace)
|
||||
WordWatcher.word_matcher_regexps(:replace, engine: :js)
|
||||
end
|
||||
|
||||
def watched_words_link
|
||||
WordWatcher.word_matcher_regexps(:link)
|
||||
WordWatcher.word_matcher_regexps(:link, engine: :js)
|
||||
end
|
||||
|
||||
def categories
|
||||
|
|
|
@ -4,7 +4,7 @@ class WatchedWordSerializer < ApplicationSerializer
|
|||
attributes :id, :word, :regexp, :replacement, :action, :case_sensitive
|
||||
|
||||
def regexp
|
||||
WordWatcher.word_to_regexp(word, whole: true)
|
||||
WordWatcher.word_to_regexp(word)
|
||||
end
|
||||
|
||||
def action
|
||||
|
|
|
@ -44,23 +44,23 @@ class WordWatcher
|
|||
end
|
||||
end
|
||||
|
||||
def self.serializable_word_matcher_regexp(action)
|
||||
word_matcher_regexp_list(action).map { |r| { r.source => { case_sensitive: !r.casefold? } } }
|
||||
def self.serializable_word_matcher_regexp(action, engine: :ruby)
|
||||
word_matcher_regexp_list(action, engine: engine).map do |r|
|
||||
{ r.source => { case_sensitive: !r.casefold? } }
|
||||
end
|
||||
end
|
||||
|
||||
# This regexp is run in miniracer, and the client JS app
|
||||
# Make sure it is compatible with major browsers when changing
|
||||
# hint: non-chrome browsers do not support 'lookbehind'
|
||||
def self.word_matcher_regexp_list(action, raise_errors: false)
|
||||
def self.word_matcher_regexp_list(action, engine: :ruby, raise_errors: false)
|
||||
words = get_cached_words(action)
|
||||
return [] if words.blank?
|
||||
|
||||
grouped_words = { case_sensitive: [], case_insensitive: [] }
|
||||
|
||||
words.each do |w, attrs|
|
||||
word = word_to_regexp(w)
|
||||
word = "(#{word})" if SiteSetting.watched_words_regular_expressions?
|
||||
|
||||
words.each do |word, attrs|
|
||||
word = word_to_regexp(word, whole: SiteSetting.watched_words_regular_expressions?)
|
||||
group_key = attrs[:case_sensitive] ? :case_sensitive : :case_insensitive
|
||||
grouped_words[group_key] << word
|
||||
end
|
||||
|
@ -68,10 +68,7 @@ class WordWatcher
|
|||
regexps = grouped_words.select { |_, w| w.present? }.transform_values { |w| w.join("|") }
|
||||
|
||||
if !SiteSetting.watched_words_regular_expressions?
|
||||
regexps.transform_values! do |regexp|
|
||||
regexp = "(#{regexp})"
|
||||
"(?:\\W|^)#{regexp}(?=\\W|$)"
|
||||
end
|
||||
regexps.transform_values! { |regexp| wrap_regexp(regexp, engine: engine) }
|
||||
end
|
||||
|
||||
regexps.map { |c, regexp| Regexp.new(regexp, c == :case_sensitive ? nil : Regexp::IGNORECASE) }
|
||||
|
@ -80,29 +77,42 @@ class WordWatcher
|
|||
[] # Admin will be alerted via admin_dashboard_data.rb
|
||||
end
|
||||
|
||||
def self.word_matcher_regexps(action)
|
||||
def self.word_matcher_regexps(action, engine: :ruby)
|
||||
if words = get_cached_words(action)
|
||||
words.map { |w, opts| [word_to_regexp(w, whole: true), opts] }.to_h
|
||||
words.map { |word, attrs| [word_to_regexp(word, engine: engine), attrs] }.to_h
|
||||
end
|
||||
end
|
||||
|
||||
def self.word_to_regexp(word, whole: false)
|
||||
def self.word_to_regexp(word, engine: :ruby, whole: true)
|
||||
if SiteSetting.watched_words_regular_expressions?
|
||||
# Strip ruby regexp format if present
|
||||
# Strip Ruby regexp format if present
|
||||
regexp = word.start_with?("(?-mix:") ? word[7..-2] : word
|
||||
regexp = "(#{regexp})" if whole
|
||||
return regexp
|
||||
end
|
||||
|
||||
regexp = Regexp.escape(word).gsub("\\*", '\S*')
|
||||
# Escape regular expression. Avoid using Regexp.escape because it escapes
|
||||
# more characters than it should (for example, whitespaces)
|
||||
regexp = word.gsub(/([.*+?^${}()|\[\]\\])/, '\\\\\1')
|
||||
|
||||
if whole && !SiteSetting.watched_words_regular_expressions?
|
||||
regexp = "(?:\\W|^)(#{regexp})(?=\\W|$)"
|
||||
end
|
||||
# Handle wildcards
|
||||
regexp = regexp.gsub("\\*", '\S*')
|
||||
|
||||
regexp = wrap_regexp(regexp, engine: engine) if whole
|
||||
|
||||
regexp
|
||||
end
|
||||
|
||||
def self.wrap_regexp(regexp, engine: :ruby)
|
||||
if engine == :js
|
||||
"(?:\\P{L}|^)(#{regexp})(?=\\P{L}|$)"
|
||||
elsif engine == :ruby
|
||||
"(?:[^[:word:]]|^)(#{regexp})(?=[^[:word:]]|$)"
|
||||
else
|
||||
"(?:\\W|^)(#{regexp})(?=\\W|$)"
|
||||
end
|
||||
end
|
||||
|
||||
def self.word_matcher_regexp_key(action)
|
||||
"watched-words-list:v#{CACHE_VERSION}:#{action}"
|
||||
end
|
||||
|
@ -212,10 +222,8 @@ class WordWatcher
|
|||
end
|
||||
|
||||
def word_matches?(word, case_sensitive: false)
|
||||
Regexp.new(
|
||||
WordWatcher.word_to_regexp(word, whole: true),
|
||||
case_sensitive ? nil : Regexp::IGNORECASE,
|
||||
).match?(@raw)
|
||||
options = case_sensitive ? nil : Regexp::IGNORECASE
|
||||
Regexp.new(WordWatcher.word_to_regexp(word), options).match?(@raw)
|
||||
end
|
||||
|
||||
def self.replace_text_with_regexp(text, regexp, replacement)
|
||||
|
|
|
@ -204,9 +204,9 @@ module PrettyText
|
|||
__optInput.emojiUnicodeReplacer = __emojiUnicodeReplacer;
|
||||
__optInput.emojiDenyList = #{Emoji.denied.to_json};
|
||||
__optInput.lookupUploadUrls = __lookupUploadUrls;
|
||||
__optInput.censoredRegexp = #{WordWatcher.serializable_word_matcher_regexp(:censor).to_json};
|
||||
__optInput.watchedWordsReplace = #{WordWatcher.word_matcher_regexps(:replace).to_json};
|
||||
__optInput.watchedWordsLink = #{WordWatcher.word_matcher_regexps(:link).to_json};
|
||||
__optInput.censoredRegexp = #{WordWatcher.serializable_word_matcher_regexp(:censor, engine: :js).to_json};
|
||||
__optInput.watchedWordsReplace = #{WordWatcher.word_matcher_regexps(:replace, engine: :js).to_json};
|
||||
__optInput.watchedWordsLink = #{WordWatcher.word_matcher_regexps(:link, engine: :js).to_json};
|
||||
__optInput.additionalOptions = #{Site.markdown_additional_options.to_json};
|
||||
JS
|
||||
|
||||
|
|
|
@ -64,6 +64,14 @@ RSpec.describe WatchedWord do
|
|||
should_block_post(manager)
|
||||
end
|
||||
|
||||
it "should handle UTF-8 characters" do
|
||||
block_word = Fabricate(:watched_word, action: WatchedWord.actions[:block], word: "abc")
|
||||
manager =
|
||||
NewPostManager.new(tl2_user, title: "Hello world", raw: "abcódef", topic_id: topic.id)
|
||||
|
||||
expect(manager.perform).to be_success
|
||||
end
|
||||
|
||||
it "should block the post from admin" do
|
||||
manager =
|
||||
NewPostManager.new(
|
||||
|
|
|
@ -79,8 +79,8 @@ RSpec.describe WordWatcher do
|
|||
|
||||
expect(regexps).to be_an(Array)
|
||||
expect(regexps.map(&:inspect)).to contain_exactly(
|
||||
"/(?:\\W|^)(#{word1}|#{word2})(?=\\W|$)/i",
|
||||
"/(?:\\W|^)(#{word3}|#{word4})(?=\\W|$)/",
|
||||
"/(?:[^[:word:]]|^)(#{word1}|#{word2})(?=[^[:word:]]|$)/i",
|
||||
"/(?:[^[:word:]]|^)(#{word3}|#{word4})(?=[^[:word:]]|$)/",
|
||||
)
|
||||
end
|
||||
|
||||
|
|
Loading…
Reference in New Issue