FIX: Add word boundaries to replace and tag watched words (#13405)

The generated regular expressions did not contain word boundaries (\b), so they matched any text that contained the watched word, even when it was only a substring of a longer word. For example, if "art" was a watched word, a post containing the word "artist" also matched.
2021-06-18 18:54:06 +03:00 · 2021-06-18 18:54:06 +03:00 · 74f7295631
parent 4afd8f9bdf
commit 74f7295631
8 changed files with 38 additions and 18 deletions
--- a/app/assets/javascripts/discourse/tests/acceptance/admin-watched-words-test.js
+++ b/app/assets/javascripts/discourse/tests/acceptance/admin-watched-words-test.js
@ -118,7 +118,6 @@ acceptance("Admin - Watched Words - Bad regular expressions", function (needs) {
            action: "block",
          },
        ],
-        regular_expressions: true,
        compiled_regular_expressions: {
          block: null,
          censor: null,
--- a/app/assets/javascripts/discourse/tests/fixtures/watched-words-fixtures.js
+++ b/app/assets/javascripts/discourse/tests/fixtures/watched-words-fixtures.js
@ -11,14 +11,14 @@ export default {
      {
        id: 7,
        word: "hi",
-        regexp: "hi",
+        regexp: "(hi)",
        replacement: "hello",
        action: "replace",
      },
      {
        id: 8,
        word: "hello",
-        regexp: "hello",
+        regexp: "(hello)",
        replacement: "greeting",
        action: "tag",
      },
--- a/app/assets/javascripts/discourse/tests/unit/lib/pretty-text-test.js
+++ b/app/assets/javascripts/discourse/tests/unit/lib/pretty-text-test.js
@ -1675,21 +1675,21 @@ var bar = 'bar';

  test("watched words replace", function (assert) {
    const opts = {
-      watchedWordsReplace: { fun: "times" },
+      watchedWordsReplace: { "(?:\\W|^)(fun)(?=\\W|$)": "times" },
    };

-    assert.cookedOptions("test fun", opts, "<p>test times</p>");
+    assert.cookedOptions("test fun funny", opts, "<p>test times funny</p>");
  });

  test("watched words link", function (assert) {
    const opts = {
-      watchedWordsLink: { fun: "https://discourse.org" },
+      watchedWordsLink: { "(?:\\W|^)(fun)(?=\\W|$)": "https://discourse.org" },
    };

    assert.cookedOptions(
-      "test fun",
+      "test fun funny",
      opts,
-      '<p>test <a href="https://discourse.org">fun</a></p>'
+      '<p>test <a href="https://discourse.org">fun</a> funny</p>'
    );
  });

@ -1697,7 +1697,7 @@ var bar = 'bar';
    const maxMatches = 100; // same limit as MD watched-words-replace plugin
    const opts = {
      siteSettings: { watched_words_regular_expressions: true },
-      watchedWordsReplace: { "\\bu?\\b": "you" },
+      watchedWordsReplace: { "(\\bu?\\b)": "you" },
    };

    assert.cookedOptions(
--- a/app/assets/javascripts/pretty-text/engines/discourse-markdown/watched-words.js
+++ b/app/assets/javascripts/pretty-text/engines/discourse-markdown/watched-words.js
@ -20,8 +20,8 @@ function findAllMatches(text, matchers) {
      count++ < MAX_MATCHES
    ) {
      matches.push({
-        index: match.index,
-        text: match[0],
+        index: match.index + match[0].indexOf(match[1]),
+        text: match[1],
        replacement: matcher.replacement,
        link: matcher.link,
      });
--- a/app/serializers/watched_word_serializer.rb
+++ b/app/serializers/watched_word_serializer.rb
@ -4,7 +4,7 @@ class WatchedWordSerializer < ApplicationSerializer
  attributes :id, :word, :regexp, :replacement, :action

  def regexp
-    WordWatcher.word_to_regexp(word)
+    WordWatcher.word_to_regexp(word, whole: true)
  end

  def action
--- a/app/services/word_watcher.rb
+++ b/app/services/word_watcher.rb
@ -54,17 +54,26 @@ class WordWatcher

  def self.word_matcher_regexps(action)
    if words = get_cached_words(action)
-      words.map { |w, r| [word_to_regexp(w), r] }.to_h
+      words.map { |w, r| [word_to_regexp(w, whole: true), r] }.to_h
    end
  end

-  def self.word_to_regexp(word)
+  def self.word_to_regexp(word, whole: false)
    if SiteSetting.watched_words_regular_expressions?
      # Strip ruby regexp format if present, we're going to make the whole thing
      # case insensitive anyway
-      return word.start_with?("(?-mix:") ? word[7..-2] : word
+      regexp = word.start_with?("(?-mix:") ? word[7..-2] : word
+      regexp = "(#{regexp})" if whole
+      return regexp
    end
-    Regexp.escape(word).gsub("\\*", '\S*')
+
+    regexp = Regexp.escape(word).gsub("\\*", '\S*')
+
+    if whole && !SiteSetting.watched_words_regular_expressions?
+      regexp = "(?:\\W|^)(#{regexp})(?=\\W|$)"
+    end
+
+    regexp
  end

  def self.word_matcher_regexp_key(action)
@ -144,6 +153,6 @@ class WordWatcher
  end

  def word_matches?(word)
-    Regexp.new(WordWatcher.word_to_regexp(word), Regexp::IGNORECASE).match?(@raw)
+    Regexp.new(WordWatcher.word_to_regexp(word, whole: true), Regexp::IGNORECASE).match?(@raw)
  end
 end
--- a/spec/components/post_creator_spec.rb
+++ b/spec/components/post_creator_spec.rb
@ -502,13 +502,21 @@ describe PostCreator do
            end

            context "without regular expressions" do
-              it "works" do
+              it "works with many tags" do
                Fabricate(:watched_word, action: WatchedWord.actions[:tag], word: "HELLO", replacement: "greetings , hey")

                @post = creator.create
                expect(@post.topic.tags.map(&:name)).to match_array(['greetings', 'hey'])
              end

+              it "works with overlapping words" do
+                Fabricate(:watched_word, action: WatchedWord.actions[:tag], word: "art", replacement: "about-art")
+                Fabricate(:watched_word, action: WatchedWord.actions[:tag], word: "artist*", replacement: "about-artists")
+
+                post = PostCreator.new(user, title: "hello world topic", raw: "this is topic abour artists", archetype_id: 1).create
+                expect(post.topic.tags.map(&:name)).to match_array(['about-artists'])
+              end
+
              it "does not treat as regular expressions" do
                Fabricate(:watched_word, action: WatchedWord.actions[:tag], word: "he(llo|y)", replacement: "greetings , hey")

--- a/spec/components/pretty_text_spec.rb
+++ b/spec/components/pretty_text_spec.rb
@ -1420,6 +1420,10 @@ HTML
      expect(PrettyText.cook("Lorem ipsum dolor sittt amet")).to match_html(<<~HTML)
        <p>Lorem ipsum something else amet</p>
      HTML
+
+      expect(PrettyText.cook("Lorem ipsum xdolor sit amet")).to match_html(<<~HTML)
+        <p>Lorem ipsum xdolor sit amet</p>
+      HTML
    end

    it "replaces words with links" do