From fd07c943adc92ade47ccc8f79894bc23bb5f5b8e Mon Sep 17 00:00:00 2001 From: Bianca Nenciu Date: Wed, 1 Nov 2023 16:41:10 +0200 Subject: [PATCH] DEV: Refactor watched words (#24163) - Ignore only invalid words, not all words if one of them is invalid - The naming scheme for methods was inconsistent - Optimize regular expressions --- .../admin/watched_words_controller.rb | 2 +- app/models/admin_dashboard_data.rb | 2 +- app/models/watched_word.rb | 123 ++++++++------- app/serializers/site_serializer.rb | 6 +- .../watched_word_list_serializer.rb | 2 +- app/services/word_watcher.rb | 149 ++++++++++-------- lib/new_post_manager.rb | 4 +- lib/pretty_text.rb | 6 +- lib/validators/censored_words_validator.rb | 4 +- .../censored_words_validator_spec.rb | 6 +- .../admin/watched_words_controller_spec.rb | 5 +- spec/services/word_watcher_spec.rb | 16 +- 12 files changed, 170 insertions(+), 155 deletions(-) diff --git a/app/controllers/admin/watched_words_controller.rb b/app/controllers/admin/watched_words_controller.rb index 2a5cbb496fd..16d81987334 100644 --- a/app/controllers/admin/watched_words_controller.rb +++ b/app/controllers/admin/watched_words_controller.rb @@ -6,7 +6,7 @@ class Admin::WatchedWordsController < Admin::StaffController skip_before_action :check_xhr, only: [:download] def index - watched_words = WatchedWord.by_action + watched_words = WatchedWord.order(:action, :word) watched_words = watched_words.where.not(action: WatchedWord.actions[:tag]) if !SiteSetting.tagging_enabled render_json_dump WatchedWordListSerializer.new(watched_words, scope: guardian, root: false) diff --git a/app/models/admin_dashboard_data.rb b/app/models/admin_dashboard_data.rb index 81e72faacae..d01589ae002 100644 --- a/app/models/admin_dashboard_data.rb +++ b/app/models/admin_dashboard_data.rb @@ -421,7 +421,7 @@ class AdminDashboardData def watched_words_check WatchedWord.actions.keys.each do |action| begin - WordWatcher.word_matcher_regexp_list(action, raise_errors: true) + WordWatcher.compiled_regexps_for_action(action, raise_errors: true) rescue RegexpError => e translated_action = I18n.t("admin_js.admin.watched_words.actions.#{action}") I18n.t( diff --git a/app/models/watched_word.rb b/app/models/watched_word.rb index 3e9b4d5b575..e03bd95fd20 100644 --- a/app/models/watched_word.rb +++ b/app/models/watched_word.rb @@ -1,6 +1,39 @@ # frozen_string_literal: true class WatchedWord < ActiveRecord::Base + MAX_WORDS_PER_ACTION = 2000 + + before_validation { self.word = WatchedWord.normalize_word(self.word) } + + before_validation do + if self.action == WatchedWord.actions[:link] && self.replacement !~ %r{\Ahttps?://} + self.replacement = + "#{Discourse.base_url}#{self.replacement&.starts_with?("/") ? "" : "/"}#{self.replacement}" + end + end + + validates :word, presence: true, uniqueness: true, length: { maximum: 100 } + validates :action, presence: true + validate :replacement_is_url, if: -> { action == WatchedWord.actions[:link] } + validate :replacement_is_tag_list, if: -> { action == WatchedWord.actions[:tag] } + + validates_each :word do |record, attr, val| + if WatchedWord.where(action: record.action).count >= MAX_WORDS_PER_ACTION + record.errors.add(:word, :too_many) + end + end + + after_save -> { WordWatcher.clear_cache! } + after_destroy -> { WordWatcher.clear_cache! } + + scope :for, + ->(word:) { + where( + "(word ILIKE :word AND case_sensitive = 'f') OR (word LIKE :word AND case_sensitive = 't')", + word: word, + ) + } + def self.actions @actions ||= Enum.new( @@ -15,65 +48,15 @@ class WatchedWord < ActiveRecord::Base ) end - MAX_WORDS_PER_ACTION = 2000 - - before_validation do - self.word = self.class.normalize_word(self.word) - if self.action == WatchedWord.actions[:link] && !(self.replacement =~ %r{\Ahttps?://}) - self.replacement = - "#{Discourse.base_url}#{self.replacement&.starts_with?("/") ? "" : "/"}#{self.replacement}" - end - end - - validates :word, presence: true, uniqueness: true, length: { maximum: 100 } - validates :action, presence: true - - validate :replacement_is_url, if: -> { action == WatchedWord.actions[:link] } - validate :replacement_is_tag_list, if: -> { action == WatchedWord.actions[:tag] } - - validates_each :word do |record, attr, val| - if WatchedWord.where(action: record.action).count >= MAX_WORDS_PER_ACTION - record.errors.add(:word, :too_many) - end - end - - after_save :clear_cache - after_destroy :clear_cache - - scope :by_action, -> { order("action ASC, word ASC") } - scope :for, - ->(word:) { - where( - "(word ILIKE :word AND case_sensitive = 'f') OR (word LIKE :word AND case_sensitive = 't')", - word: word, - ) - } - - def self.normalize_word(w) - w.strip.squeeze("*") - end - - def replacement_is_url - errors.add(:base, :invalid_url) if !(replacement =~ URI.regexp) - end - - def replacement_is_tag_list - tag_list = replacement&.split(",") - tags = Tag.where(name: tag_list) - if (tag_list.blank? || tags.empty? || tag_list.size != tags.size) - errors.add(:base, :invalid_tag_list) - end - end - def self.create_or_update_word(params) - new_word = normalize_word(params[:word]) - w = self.for(word: new_word).first_or_initialize(word: new_word) - w.replacement = params[:replacement] if params[:replacement] - w.action_key = params[:action_key] if params[:action_key] - w.action = params[:action] if params[:action] - w.case_sensitive = params[:case_sensitive] if !params[:case_sensitive].nil? - w.save - w + word = normalize_word(params[:word]) + word = self.for(word: word).first_or_initialize(word: word) + word.replacement = params[:replacement] if params[:replacement] + word.action_key = params[:action_key] if params[:action_key] + word.action = params[:action] if params[:action] + word.case_sensitive = params[:case_sensitive] if !params[:case_sensitive].nil? + word.save + word end def self.has_replacement?(action) @@ -81,7 +64,7 @@ class WatchedWord < ActiveRecord::Base end def action_key=(arg) - self.action = self.class.actions[arg.to_sym] + self.action = WatchedWord.actions[arg.to_sym] end def action_log_details @@ -92,8 +75,26 @@ class WatchedWord < ActiveRecord::Base end end - def clear_cache - WordWatcher.clear_cache! + private + + def self.normalize_word(word) + # When a regular expression is converted to a string, it is wrapped with + # '(?-mix:' and ')' + word = word[7..-2] if word.start_with?("(?-mix:") + + word.strip.squeeze("*") + end + + def replacement_is_url + errors.add(:base, :invalid_url) if replacement !~ URI.regexp + end + + def replacement_is_tag_list + tag_list = replacement&.split(",") + tags = Tag.where(name: tag_list) + if tag_list.blank? || tags.empty? || tag_list.size != tags.size + errors.add(:base, :invalid_tag_list) + end end end diff --git a/app/serializers/site_serializer.rb b/app/serializers/site_serializer.rb index 9bbe67f4271..8693cd5d040 100644 --- a/app/serializers/site_serializer.rb +++ b/app/serializers/site_serializer.rb @@ -210,7 +210,7 @@ class SiteSerializer < ApplicationSerializer end def censored_regexp - WordWatcher.serializable_word_matcher_regexp(:censor, engine: :js) + WordWatcher.serialized_regexps_for_action(:censor, engine: :js) end def custom_emoji_translation @@ -226,11 +226,11 @@ class SiteSerializer < ApplicationSerializer end def watched_words_replace - WordWatcher.word_matcher_regexps(:replace, engine: :js) + WordWatcher.regexps_for_action(:replace, engine: :js) end def watched_words_link - WordWatcher.word_matcher_regexps(:link, engine: :js) + WordWatcher.regexps_for_action(:link, engine: :js) end def categories diff --git a/app/serializers/watched_word_list_serializer.rb b/app/serializers/watched_word_list_serializer.rb index 9d0929a9fa9..42f89f326a6 100644 --- a/app/serializers/watched_word_list_serializer.rb +++ b/app/serializers/watched_word_list_serializer.rb @@ -18,7 +18,7 @@ class WatchedWordListSerializer < ApplicationSerializer def compiled_regular_expressions expressions = {} actions.each do |action| - expressions[action] = WordWatcher.serializable_word_matcher_regexp(action, engine: :js) + expressions[action] = WordWatcher.serialized_regexps_for_action(action, engine: :js) end expressions end diff --git a/app/services/word_watcher.rb b/app/services/word_watcher.rb index f37f48fe2d1..442c7fca29c 100644 --- a/app/services/word_watcher.rb +++ b/app/services/word_watcher.rb @@ -18,106 +18,109 @@ class WordWatcher @cache_enabled end + def self.cache_key(action) + "watched-words-list:v#{CACHE_VERSION}:#{action}" + end + + def self.clear_cache! + WatchedWord.actions.each { |action, _| Discourse.cache.delete(cache_key(action)) } + end + def self.words_for_action(action) WatchedWord .where(action: WatchedWord.actions[action.to_sym]) .limit(WatchedWord::MAX_WORDS_PER_ACTION) .order(:id) .pluck(:word, :replacement, :case_sensitive) - .to_h do |w, r, c| - [w, { word: word_to_regexp(w, whole: false), replacement: r, case_sensitive: c }.compact] - end + .to_h { |w, r, c| [w, { word: w, replacement: r, case_sensitive: c }.compact] } end - def self.words_for_action_exists?(action) + def self.words_for_action_exist?(action) WatchedWord.where(action: WatchedWord.actions[action.to_sym]).exists? end - def self.get_cached_words(action) + def self.cached_words_for_action(action) if cache_enabled? Discourse .cache - .fetch(word_matcher_regexp_key(action), expires_in: 1.day) do - words_for_action(action).presence - end + .fetch(cache_key(action), expires_in: 1.day) { words_for_action(action).presence } else words_for_action(action).presence end end - def self.serializable_word_matcher_regexp(action, engine: :ruby) - word_matcher_regexp_list(action, engine: engine).map do |r| - { r.source => { case_sensitive: !r.casefold? } } + def self.regexps_for_action(action, engine: :ruby) + cached_words_for_action(action)&.to_h do |word, attrs| + [word_to_regexp(word, engine: engine), attrs] end end # This regexp is run in miniracer, and the client JS app # Make sure it is compatible with major browsers when changing # hint: non-chrome browsers do not support 'lookbehind' - def self.word_matcher_regexp_list(action, engine: :ruby, raise_errors: false) - words = get_cached_words(action) + def self.compiled_regexps_for_action(action, engine: :ruby, raise_errors: false) + words = cached_words_for_action(action) return [] if words.blank? - grouped_words = { case_sensitive: [], case_insensitive: [] } + words + .values + .group_by { |attrs| attrs[:case_sensitive] ? :case_sensitive : :case_insensitive } + .map do |group_key, attrs_list| + words = attrs_list.map { |attrs| attrs[:word] } - words.each do |word, attrs| - word = word_to_regexp(word, whole: SiteSetting.watched_words_regular_expressions?) - group_key = attrs[:case_sensitive] ? :case_sensitive : :case_insensitive - grouped_words[group_key] << word - end + # Compile all watched words into a single regular expression + regexp = + words + .map do |word| + r = word_to_regexp(word, match_word: SiteSetting.watched_words_regular_expressions?) + begin + r if Regexp.new(r) + rescue RegexpError + raise if raise_errors + end + end + .select { |r| r.present? } + .join("|") - regexps = grouped_words.select { |_, w| w.present? }.transform_values { |w| w.join("|") } + # Add word boundaries to the regexp for regular watched words + regexp = + match_word_regexp( + regexp, + engine: engine, + ) if !SiteSetting.watched_words_regular_expressions? - if !SiteSetting.watched_words_regular_expressions? - regexps.transform_values! { |regexp| wrap_regexp(regexp, engine: engine) } - end - - regexps.map { |c, regexp| Regexp.new(regexp, c == :case_sensitive ? nil : Regexp::IGNORECASE) } - rescue RegexpError - raise if raise_errors - [] # Admin will be alerted via admin_dashboard_data.rb + # Add case insensitive flag if needed + Regexp.new(regexp, group_key == :case_sensitive ? nil : Regexp::IGNORECASE) + end end - def self.word_matcher_regexps(action, engine: :ruby) - get_cached_words(action)&.to_h { |word, attrs| [word_to_regexp(word, engine: engine), attrs] } + def self.serialized_regexps_for_action(action, engine: :ruby) + compiled_regexps_for_action(action, engine: engine).map do |r| + { r.source => { case_sensitive: !r.casefold? } } + end end - def self.word_to_regexp(word, engine: :ruby, whole: true) + def self.word_to_regexp(word, engine: :ruby, match_word: true) if SiteSetting.watched_words_regular_expressions? - # Strip Ruby regexp format if present - regexp = word.start_with?("(?-mix:") ? word[7..-2] : word - regexp = "(#{regexp})" if whole - return regexp - end - - # Escape regular expression. Avoid using Regexp.escape because it escapes - # more characters than it should (for example, whitespaces) - regexp = word.gsub(/([.*+?^${}()|\[\]\\])/, '\\\\\1') - - # Handle wildcards - regexp = regexp.gsub("\\*", '\S*') - - regexp = wrap_regexp(regexp, engine: engine) if whole - - regexp - end - - def self.wrap_regexp(regexp, engine: :ruby) - if engine == :js - "(?:\\P{L}|^)(#{regexp})(?=\\P{L}|$)" - elsif engine == :ruby - "(?:[^[:word:]]|^)(#{regexp})(?=[^[:word:]]|$)" + regexp = word + regexp = "(#{regexp})" if match_word + regexp else - "(?:\\W|^)(#{regexp})(?=\\W|$)" - end - end + # Convert word to regex by escaping special characters in a regexp. + # Avoid using Regexp.escape because it escapes more characters than + # it should (for example, whitespaces, dashes, etc) + regexp = word.gsub(/([.*+?^${}()|\[\]\\])/, '\\\\\1') - def self.word_matcher_regexp_key(action) - "watched-words-list:v#{CACHE_VERSION}:#{action}" + # Convert wildcards to regexp + regexp = regexp.gsub("\\*", '\S*') + + regexp = match_word_regexp(regexp, engine: engine) if match_word + regexp + end end def self.censor(html) - regexps = word_matcher_regexp_list(:censor) + regexps = compiled_regexps_for_action(:censor) return html if regexps.blank? doc = Nokogiri::HTML5.fragment(html) @@ -133,7 +136,7 @@ class WordWatcher def self.censor_text(text) return text if text.blank? - regexps = word_matcher_regexp_list(:censor) + regexps = compiled_regexps_for_action(:censor) return text if regexps.blank? regexps.inject(text) { |txt, regexp| censor_text_with_regexp(txt, regexp) } @@ -156,10 +159,6 @@ class WordWatcher text end - def self.clear_cache! - WatchedWord.actions.each { |a, i| Discourse.cache.delete word_matcher_regexp_key(a) } - end - def requires_approval? word_matches_for_action?(:require_approval) end @@ -177,7 +176,7 @@ class WordWatcher end def word_matches_for_action?(action, all_matches: false) - regexps = self.class.word_matcher_regexp_list(action) + regexps = self.class.compiled_regexps_for_action(action) return if regexps.blank? match_list = [] @@ -254,14 +253,28 @@ class WordWatcher private_class_method :censor_text_with_regexp - private + # Returns a regexp that transforms a regular expression into a regular + # expression that matches a whole word. + def self.match_word_regexp(regexp, engine: :ruby) + if engine == :js + "(?:\\P{L}|^)(#{regexp})(?=\\P{L}|$)" + elsif engine == :ruby + "(?:[^[:word:]]|^)(#{regexp})(?=[^[:word:]]|$)" + else + raise "unknown regexp engine: #{engine}" + end + end + + private_class_method :match_word_regexp def self.replace(text, watch_word_type) - word_matcher_regexps(watch_word_type) + regexps_for_action(watch_word_type) .to_a .reduce(text) do |t, (word_regexp, attrs)| case_flag = attrs[:case_sensitive] ? nil : Regexp::IGNORECASE replace_text_with_regexp(t, Regexp.new(word_regexp, case_flag), attrs[:replacement]) end end + + private_class_method :replace end diff --git a/lib/new_post_manager.rb b/lib/new_post_manager.rb index 80338633480..970cb943b92 100644 --- a/lib/new_post_manager.rb +++ b/lib/new_post_manager.rb @@ -202,8 +202,8 @@ class NewPostManager def self.queue_enabled? SiteSetting.approve_post_count > 0 || SiteSetting.approve_unless_trust_level.to_i > 0 || SiteSetting.approve_new_topics_unless_trust_level.to_i > 0 || - SiteSetting.approve_unless_staged || - WordWatcher.words_for_action_exists?(:require_approval) || handlers.size > 1 + SiteSetting.approve_unless_staged || WordWatcher.words_for_action_exist?(:require_approval) || + handlers.size > 1 end def initialize(user, args) diff --git a/lib/pretty_text.rb b/lib/pretty_text.rb index 3b283bc01a4..85927fc48ba 100644 --- a/lib/pretty_text.rb +++ b/lib/pretty_text.rb @@ -203,9 +203,9 @@ module PrettyText __optInput.emojiUnicodeReplacer = __emojiUnicodeReplacer; __optInput.emojiDenyList = #{Emoji.denied.to_json}; __optInput.lookupUploadUrls = __lookupUploadUrls; - __optInput.censoredRegexp = #{WordWatcher.serializable_word_matcher_regexp(:censor, engine: :js).to_json}; - __optInput.watchedWordsReplace = #{WordWatcher.word_matcher_regexps(:replace, engine: :js).to_json}; - __optInput.watchedWordsLink = #{WordWatcher.word_matcher_regexps(:link, engine: :js).to_json}; + __optInput.censoredRegexp = #{WordWatcher.serialized_regexps_for_action(:censor, engine: :js).to_json}; + __optInput.watchedWordsReplace = #{WordWatcher.regexps_for_action(:replace, engine: :js).to_json}; + __optInput.watchedWordsLink = #{WordWatcher.regexps_for_action(:link, engine: :js).to_json}; __optInput.additionalOptions = #{Site.markdown_additional_options.to_json}; __optInput.avatar_sizes = #{SiteSetting.avatar_sizes.to_json}; JS diff --git a/lib/validators/censored_words_validator.rb b/lib/validators/censored_words_validator.rb index e69b7f77806..b1a61d694b6 100644 --- a/lib/validators/censored_words_validator.rb +++ b/lib/validators/censored_words_validator.rb @@ -2,8 +2,8 @@ class CensoredWordsValidator < ActiveModel::EachValidator def validate_each(record, attribute, value) - words_regexps = WordWatcher.word_matcher_regexp_list(:censor) - if WordWatcher.words_for_action_exists?(:censor).present? && words_regexps.present? + words_regexps = WordWatcher.compiled_regexps_for_action(:censor) + if WordWatcher.words_for_action_exist?(:censor).present? && words_regexps.present? censored_words = censor_words(value, words_regexps) return if censored_words.blank? diff --git a/spec/lib/validators/censored_words_validator_spec.rb b/spec/lib/validators/censored_words_validator_spec.rb index 89423466944..924553c3595 100644 --- a/spec/lib/validators/censored_words_validator_spec.rb +++ b/spec/lib/validators/censored_words_validator_spec.rb @@ -11,8 +11,8 @@ RSpec.describe CensoredWordsValidator do Fabricate(:watched_word, action: WatchedWord.actions[:censor], word: "bad") end - context "when word_matcher_regexp_list is empty" do - before { WordWatcher.stubs(:word_matcher_regexp_list).returns([]) } + context "when compiled_regexps_for_action is empty" do + before { WordWatcher.stubs(:compiled_regexps_for_action).returns([]) } it "adds no errors to the record" do validate @@ -20,7 +20,7 @@ RSpec.describe CensoredWordsValidator do end end - context "when word_matcher_regexp_list is not empty" do + context "when compiled_regexps_for_action is not empty" do context "when the new value does not contain the watched word" do let(:value) { "some new good text" } diff --git a/spec/requests/admin/watched_words_controller_spec.rb b/spec/requests/admin/watched_words_controller_spec.rb index 12677fbc655..d30c9a12ae6 100644 --- a/spec/requests/admin/watched_words_controller_spec.rb +++ b/spec/requests/admin/watched_words_controller_spec.rb @@ -51,10 +51,7 @@ RSpec.describe Admin::WatchedWordsController do ), ) expect(watched_words["compiled_regular_expressions"]["block"].first).to eq( - WordWatcher - .serializable_word_matcher_regexp(:block, engine: :js) - .first - .deep_stringify_keys, + WordWatcher.serialized_regexps_for_action(:block, engine: :js).first.deep_stringify_keys, ) end end diff --git a/spec/services/word_watcher_spec.rb b/spec/services/word_watcher_spec.rb index 7749e7964c1..a2911854ec4 100644 --- a/spec/services/word_watcher_spec.rb +++ b/spec/services/word_watcher_spec.rb @@ -52,7 +52,7 @@ RSpec.describe WordWatcher do end end - describe ".word_matcher_regexp_list" do + describe ".compiled_regexps_for_action" do let!(:word1) { Fabricate(:watched_word, action: WatchedWord.actions[:block]).word } let!(:word2) { Fabricate(:watched_word, action: WatchedWord.actions[:block]).word } let!(:word3) do @@ -65,7 +65,7 @@ RSpec.describe WordWatcher do context "when watched_words_regular_expressions = true" do it "returns the proper regexp" do SiteSetting.watched_words_regular_expressions = true - regexps = described_class.word_matcher_regexp_list(:block) + regexps = described_class.compiled_regexps_for_action(:block) expect(regexps).to be_an(Array) expect(regexps.map(&:inspect)).to contain_exactly( @@ -78,7 +78,7 @@ RSpec.describe WordWatcher do context "when watched_words_regular_expressions = false" do it "returns the proper regexp" do SiteSetting.watched_words_regular_expressions = false - regexps = described_class.word_matcher_regexp_list(:block) + regexps = described_class.compiled_regexps_for_action(:block) expect(regexps).to be_an(Array) expect(regexps.map(&:inspect)).to contain_exactly( @@ -88,7 +88,7 @@ RSpec.describe WordWatcher do end it "is empty for an action without watched words" do - regexps = described_class.word_matcher_regexp_list(:censor) + regexps = described_class.compiled_regexps_for_action(:censor) expect(regexps).to be_an(Array) expect(regexps).to be_empty @@ -102,12 +102,16 @@ RSpec.describe WordWatcher do end it "does not raise an exception by default" do - expect { described_class.word_matcher_regexp_list(:block) }.not_to raise_error + expect { described_class.compiled_regexps_for_action(:block) }.not_to raise_error + expect(described_class.compiled_regexps_for_action(:block)).to contain_exactly( + /(#{word1})|(#{word2})/i, + /(#{word3})|(#{word4})/, + ) end it "raises an exception with raise_errors set to true" do expect { - described_class.word_matcher_regexp_list(:block, raise_errors: true) + described_class.compiled_regexps_for_action(:block, raise_errors: true) }.to raise_error(RegexpError) end end