FIX: Handle all UTF-8 characters (#21344)

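Watched words were converted to regular expressions that used \W to detect word boundaries. \W treats only ASCII characters as word characters, so non-ASCII letters (for example "ó") were mistaken for word boundaries. Using [^[:word:]] instead ensures that UTF-8 characters are also handled correctly.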
2023-05-15 11:45:04 +02:00 · 2023-05-15 11:45:04 +02:00 · 9a2780397f
parent 23a146a7c6
commit 9a2780397f
7 changed files with 49 additions and 33 deletions
--- a/app/assets/javascripts/discourse-common/addon/utils/watched-words.js
+++ b/app/assets/javascripts/discourse-common/addon/utils/watched-words.js
@ -1,6 +1,6 @@
 export function createWatchedWordRegExp(word) {
  const caseFlag = word.case_sensitive ? "" : "i";
-  return new RegExp(word.regexp, `${caseFlag}g`);
+  return new RegExp(word.regexp, `${caseFlag}gu`);
 }

 export function toWatchedWord(regexp) {
--- a/app/serializers/site_serializer.rb
+++ b/app/serializers/site_serializer.rb
@ -205,7 +205,7 @@ class SiteSerializer < ApplicationSerializer
  end

  def censored_regexp
-    WordWatcher.serializable_word_matcher_regexp(:censor)
+    WordWatcher.serializable_word_matcher_regexp(:censor, engine: :js)
  end

  def custom_emoji_translation
@ -221,11 +221,11 @@ class SiteSerializer < ApplicationSerializer
  end

  def watched_words_replace
-    WordWatcher.word_matcher_regexps(:replace)
+    WordWatcher.word_matcher_regexps(:replace, engine: :js)
  end

  def watched_words_link
-    WordWatcher.word_matcher_regexps(:link)
+    WordWatcher.word_matcher_regexps(:link, engine: :js)
  end

  def categories
--- a/app/serializers/watched_word_serializer.rb
+++ b/app/serializers/watched_word_serializer.rb
@ -4,7 +4,7 @@ class WatchedWordSerializer < ApplicationSerializer
  attributes :id, :word, :regexp, :replacement, :action, :case_sensitive

  def regexp
-    WordWatcher.word_to_regexp(word, whole: true)
+    WordWatcher.word_to_regexp(word)
  end

  def action
--- a/app/services/word_watcher.rb
+++ b/app/services/word_watcher.rb
@ -44,23 +44,23 @@ class WordWatcher
    end
  end

-  def self.serializable_word_matcher_regexp(action)
-    word_matcher_regexp_list(action).map { |r| { r.source => { case_sensitive: !r.casefold? } } }
+  def self.serializable_word_matcher_regexp(action, engine: :ruby)
+    word_matcher_regexp_list(action, engine: engine).map do |r|
+      { r.source => { case_sensitive: !r.casefold? } }
+    end
  end

  # This regexp is run in miniracer, and the client JS app
  # Make sure it is compatible with major browsers when changing
  # hint: non-chrome browsers do not support 'lookbehind'
-  def self.word_matcher_regexp_list(action, raise_errors: false)
+  def self.word_matcher_regexp_list(action, engine: :ruby, raise_errors: false)
    words = get_cached_words(action)
    return [] if words.blank?

    grouped_words = { case_sensitive: [], case_insensitive: [] }

-    words.each do |w, attrs|
-      word = word_to_regexp(w)
-      word = "(#{word})" if SiteSetting.watched_words_regular_expressions?
-
+    words.each do |word, attrs|
+      word = word_to_regexp(word, whole: SiteSetting.watched_words_regular_expressions?)
      group_key = attrs[:case_sensitive] ? :case_sensitive : :case_insensitive
      grouped_words[group_key] << word
    end
@ -68,10 +68,7 @@ class WordWatcher
    regexps = grouped_words.select { |_, w| w.present? }.transform_values { |w| w.join("|") }

    if !SiteSetting.watched_words_regular_expressions?
-      regexps.transform_values! do |regexp|
-        regexp = "(#{regexp})"
-        "(?:\\W|^)#{regexp}(?=\\W|$)"
-      end
+      regexps.transform_values! { |regexp| wrap_regexp(regexp, engine: engine) }
    end

    regexps.map { |c, regexp| Regexp.new(regexp, c == :case_sensitive ? nil : Regexp::IGNORECASE) }
@ -80,29 +77,42 @@ class WordWatcher
    [] # Admin will be alerted via admin_dashboard_data.rb
  end

-  def self.word_matcher_regexps(action)
+  def self.word_matcher_regexps(action, engine: :ruby)
    if words = get_cached_words(action)
-      words.map { |w, opts| [word_to_regexp(w, whole: true), opts] }.to_h
+      words.map { |word, attrs| [word_to_regexp(word, engine: engine), attrs] }.to_h
    end
  end

-  def self.word_to_regexp(word, whole: false)
+  def self.word_to_regexp(word, engine: :ruby, whole: true)
    if SiteSetting.watched_words_regular_expressions?
-      # Strip ruby regexp format if present
+      # Strip Ruby regexp format if present
      regexp = word.start_with?("(?-mix:") ? word[7..-2] : word
      regexp = "(#{regexp})" if whole
      return regexp
    end

-    regexp = Regexp.escape(word).gsub("\\*", '\S*')
+    # Escape regular expression. Avoid using Regexp.escape because it escapes
+    # more characters than it should (for example, whitespaces)
+    regexp = word.gsub(/([.*+?^${}()|\[\]\\])/, '\\\\\1')

-    if whole && !SiteSetting.watched_words_regular_expressions?
-      regexp = "(?:\\W|^)(#{regexp})(?=\\W|$)"
-    end
+    # Handle wildcards
+    regexp = regexp.gsub("\\*", '\S*')
+
+    regexp = wrap_regexp(regexp, engine: engine) if whole

    regexp
  end

+  def self.wrap_regexp(regexp, engine: :ruby)
+    if engine == :js
+      "(?:\\P{L}|^)(#{regexp})(?=\\P{L}|$)"
+    elsif engine == :ruby
+      "(?:[^[:word:]]|^)(#{regexp})(?=[^[:word:]]|$)"
+    else
+      "(?:\\W|^)(#{regexp})(?=\\W|$)"
+    end
+  end
+
  def self.word_matcher_regexp_key(action)
    "watched-words-list:v#{CACHE_VERSION}:#{action}"
  end
@ -212,10 +222,8 @@ class WordWatcher
  end

  def word_matches?(word, case_sensitive: false)
-    Regexp.new(
-      WordWatcher.word_to_regexp(word, whole: true),
-      case_sensitive ? nil : Regexp::IGNORECASE,
-    ).match?(@raw)
+    options = case_sensitive ? nil : Regexp::IGNORECASE
+    Regexp.new(WordWatcher.word_to_regexp(word), options).match?(@raw)
  end

  def self.replace_text_with_regexp(text, regexp, replacement)
--- a/lib/pretty_text.rb
+++ b/lib/pretty_text.rb
@ -204,9 +204,9 @@ module PrettyText
        __optInput.emojiUnicodeReplacer = __emojiUnicodeReplacer;
        __optInput.emojiDenyList = #{Emoji.denied.to_json};
        __optInput.lookupUploadUrls = __lookupUploadUrls;
-        __optInput.censoredRegexp = #{WordWatcher.serializable_word_matcher_regexp(:censor).to_json};
-        __optInput.watchedWordsReplace = #{WordWatcher.word_matcher_regexps(:replace).to_json};
-        __optInput.watchedWordsLink = #{WordWatcher.word_matcher_regexps(:link).to_json};
+        __optInput.censoredRegexp = #{WordWatcher.serializable_word_matcher_regexp(:censor, engine: :js).to_json};
+        __optInput.watchedWordsReplace = #{WordWatcher.word_matcher_regexps(:replace, engine: :js).to_json};
+        __optInput.watchedWordsLink = #{WordWatcher.word_matcher_regexps(:link, engine: :js).to_json};
        __optInput.additionalOptions = #{Site.markdown_additional_options.to_json};
      JS

--- a/spec/integration/watched_words_spec.rb
+++ b/spec/integration/watched_words_spec.rb
@ -64,6 +64,14 @@ RSpec.describe WatchedWord do
      should_block_post(manager)
    end

+    it "should handle UTF-8 characters" do
+      block_word = Fabricate(:watched_word, action: WatchedWord.actions[:block], word: "abc")
+      manager =
+        NewPostManager.new(tl2_user, title: "Hello world", raw: "abcódef", topic_id: topic.id)
+
+      expect(manager.perform).to be_success
+    end
+
    it "should block the post from admin" do
      manager =
        NewPostManager.new(
--- a/spec/services/word_watcher_spec.rb
+++ b/spec/services/word_watcher_spec.rb
@ -79,8 +79,8 @@ RSpec.describe WordWatcher do

        expect(regexps).to be_an(Array)
        expect(regexps.map(&:inspect)).to contain_exactly(
-          "/(?:\\W|^)(#{word1}|#{word2})(?=\\W|$)/i",
-          "/(?:\\W|^)(#{word3}|#{word4})(?=\\W|$)/",
+          "/(?:[^[:word:]]|^)(#{word1}|#{word2})(?=[^[:word:]]|$)/i",
+          "/(?:[^[:word:]]|^)(#{word3}|#{word4})(?=[^[:word:]]|$)/",
        )
      end