DEV: Refactor watched words (#24163)

- Ignore only invalid words, not all words if one of them is invalid

- The naming scheme for methods was inconsistent

- Optimize regular expressions
This commit is contained in:
Bianca Nenciu 2023-11-01 16:41:10 +02:00 committed by GitHub
parent a32fce9e1d
commit fd07c943ad
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 170 additions and 155 deletions

View File

@ -6,7 +6,7 @@ class Admin::WatchedWordsController < Admin::StaffController
skip_before_action :check_xhr, only: [:download] skip_before_action :check_xhr, only: [:download]
def index def index
watched_words = WatchedWord.by_action watched_words = WatchedWord.order(:action, :word)
watched_words = watched_words =
watched_words.where.not(action: WatchedWord.actions[:tag]) if !SiteSetting.tagging_enabled watched_words.where.not(action: WatchedWord.actions[:tag]) if !SiteSetting.tagging_enabled
render_json_dump WatchedWordListSerializer.new(watched_words, scope: guardian, root: false) render_json_dump WatchedWordListSerializer.new(watched_words, scope: guardian, root: false)

View File

@ -421,7 +421,7 @@ class AdminDashboardData
def watched_words_check def watched_words_check
WatchedWord.actions.keys.each do |action| WatchedWord.actions.keys.each do |action|
begin begin
WordWatcher.word_matcher_regexp_list(action, raise_errors: true) WordWatcher.compiled_regexps_for_action(action, raise_errors: true)
rescue RegexpError => e rescue RegexpError => e
translated_action = I18n.t("admin_js.admin.watched_words.actions.#{action}") translated_action = I18n.t("admin_js.admin.watched_words.actions.#{action}")
I18n.t( I18n.t(

View File

@ -1,6 +1,39 @@
# frozen_string_literal: true # frozen_string_literal: true
class WatchedWord < ActiveRecord::Base class WatchedWord < ActiveRecord::Base
MAX_WORDS_PER_ACTION = 2000
before_validation { self.word = WatchedWord.normalize_word(self.word) }
before_validation do
if self.action == WatchedWord.actions[:link] && self.replacement !~ %r{\Ahttps?://}
self.replacement =
"#{Discourse.base_url}#{self.replacement&.starts_with?("/") ? "" : "/"}#{self.replacement}"
end
end
validates :word, presence: true, uniqueness: true, length: { maximum: 100 }
validates :action, presence: true
validate :replacement_is_url, if: -> { action == WatchedWord.actions[:link] }
validate :replacement_is_tag_list, if: -> { action == WatchedWord.actions[:tag] }
validates_each :word do |record, attr, val|
if WatchedWord.where(action: record.action).count >= MAX_WORDS_PER_ACTION
record.errors.add(:word, :too_many)
end
end
after_save -> { WordWatcher.clear_cache! }
after_destroy -> { WordWatcher.clear_cache! }
scope :for,
->(word:) {
where(
"(word ILIKE :word AND case_sensitive = 'f') OR (word LIKE :word AND case_sensitive = 't')",
word: word,
)
}
def self.actions def self.actions
@actions ||= @actions ||=
Enum.new( Enum.new(
@ -15,65 +48,15 @@ class WatchedWord < ActiveRecord::Base
) )
end end
MAX_WORDS_PER_ACTION = 2000
before_validation do
self.word = self.class.normalize_word(self.word)
if self.action == WatchedWord.actions[:link] && !(self.replacement =~ %r{\Ahttps?://})
self.replacement =
"#{Discourse.base_url}#{self.replacement&.starts_with?("/") ? "" : "/"}#{self.replacement}"
end
end
validates :word, presence: true, uniqueness: true, length: { maximum: 100 }
validates :action, presence: true
validate :replacement_is_url, if: -> { action == WatchedWord.actions[:link] }
validate :replacement_is_tag_list, if: -> { action == WatchedWord.actions[:tag] }
validates_each :word do |record, attr, val|
if WatchedWord.where(action: record.action).count >= MAX_WORDS_PER_ACTION
record.errors.add(:word, :too_many)
end
end
after_save :clear_cache
after_destroy :clear_cache
scope :by_action, -> { order("action ASC, word ASC") }
scope :for,
->(word:) {
where(
"(word ILIKE :word AND case_sensitive = 'f') OR (word LIKE :word AND case_sensitive = 't')",
word: word,
)
}
def self.normalize_word(w)
w.strip.squeeze("*")
end
def replacement_is_url
errors.add(:base, :invalid_url) if !(replacement =~ URI.regexp)
end
def replacement_is_tag_list
tag_list = replacement&.split(",")
tags = Tag.where(name: tag_list)
if (tag_list.blank? || tags.empty? || tag_list.size != tags.size)
errors.add(:base, :invalid_tag_list)
end
end
def self.create_or_update_word(params) def self.create_or_update_word(params)
new_word = normalize_word(params[:word]) word = normalize_word(params[:word])
w = self.for(word: new_word).first_or_initialize(word: new_word) word = self.for(word: word).first_or_initialize(word: word)
w.replacement = params[:replacement] if params[:replacement] word.replacement = params[:replacement] if params[:replacement]
w.action_key = params[:action_key] if params[:action_key] word.action_key = params[:action_key] if params[:action_key]
w.action = params[:action] if params[:action] word.action = params[:action] if params[:action]
w.case_sensitive = params[:case_sensitive] if !params[:case_sensitive].nil? word.case_sensitive = params[:case_sensitive] if !params[:case_sensitive].nil?
w.save word.save
w word
end end
def self.has_replacement?(action) def self.has_replacement?(action)
@ -81,7 +64,7 @@ class WatchedWord < ActiveRecord::Base
end end
def action_key=(arg) def action_key=(arg)
self.action = self.class.actions[arg.to_sym] self.action = WatchedWord.actions[arg.to_sym]
end end
def action_log_details def action_log_details
@ -92,8 +75,26 @@ class WatchedWord < ActiveRecord::Base
end end
end end
def clear_cache private
WordWatcher.clear_cache!
def self.normalize_word(word)
# When a regular expression is converted to a string, it is wrapped with
# '(?-mix:' and ')'
word = word[7..-2] if word.start_with?("(?-mix:")
word.strip.squeeze("*")
end
def replacement_is_url
errors.add(:base, :invalid_url) if replacement !~ URI.regexp
end
def replacement_is_tag_list
tag_list = replacement&.split(",")
tags = Tag.where(name: tag_list)
if tag_list.blank? || tags.empty? || tag_list.size != tags.size
errors.add(:base, :invalid_tag_list)
end
end end
end end

View File

@ -210,7 +210,7 @@ class SiteSerializer < ApplicationSerializer
end end
def censored_regexp def censored_regexp
WordWatcher.serializable_word_matcher_regexp(:censor, engine: :js) WordWatcher.serialized_regexps_for_action(:censor, engine: :js)
end end
def custom_emoji_translation def custom_emoji_translation
@ -226,11 +226,11 @@ class SiteSerializer < ApplicationSerializer
end end
def watched_words_replace def watched_words_replace
WordWatcher.word_matcher_regexps(:replace, engine: :js) WordWatcher.regexps_for_action(:replace, engine: :js)
end end
def watched_words_link def watched_words_link
WordWatcher.word_matcher_regexps(:link, engine: :js) WordWatcher.regexps_for_action(:link, engine: :js)
end end
def categories def categories

View File

@ -18,7 +18,7 @@ class WatchedWordListSerializer < ApplicationSerializer
def compiled_regular_expressions def compiled_regular_expressions
expressions = {} expressions = {}
actions.each do |action| actions.each do |action|
expressions[action] = WordWatcher.serializable_word_matcher_regexp(action, engine: :js) expressions[action] = WordWatcher.serialized_regexps_for_action(action, engine: :js)
end end
expressions expressions
end end

View File

@ -18,106 +18,109 @@ class WordWatcher
@cache_enabled @cache_enabled
end end
def self.cache_key(action)
"watched-words-list:v#{CACHE_VERSION}:#{action}"
end
def self.clear_cache!
WatchedWord.actions.each { |action, _| Discourse.cache.delete(cache_key(action)) }
end
def self.words_for_action(action) def self.words_for_action(action)
WatchedWord WatchedWord
.where(action: WatchedWord.actions[action.to_sym]) .where(action: WatchedWord.actions[action.to_sym])
.limit(WatchedWord::MAX_WORDS_PER_ACTION) .limit(WatchedWord::MAX_WORDS_PER_ACTION)
.order(:id) .order(:id)
.pluck(:word, :replacement, :case_sensitive) .pluck(:word, :replacement, :case_sensitive)
.to_h do |w, r, c| .to_h { |w, r, c| [w, { word: w, replacement: r, case_sensitive: c }.compact] }
[w, { word: word_to_regexp(w, whole: false), replacement: r, case_sensitive: c }.compact]
end
end end
def self.words_for_action_exists?(action) def self.words_for_action_exist?(action)
WatchedWord.where(action: WatchedWord.actions[action.to_sym]).exists? WatchedWord.where(action: WatchedWord.actions[action.to_sym]).exists?
end end
def self.get_cached_words(action) def self.cached_words_for_action(action)
if cache_enabled? if cache_enabled?
Discourse Discourse
.cache .cache
.fetch(word_matcher_regexp_key(action), expires_in: 1.day) do .fetch(cache_key(action), expires_in: 1.day) { words_for_action(action).presence }
words_for_action(action).presence
end
else else
words_for_action(action).presence words_for_action(action).presence
end end
end end
def self.serializable_word_matcher_regexp(action, engine: :ruby) def self.regexps_for_action(action, engine: :ruby)
word_matcher_regexp_list(action, engine: engine).map do |r| cached_words_for_action(action)&.to_h do |word, attrs|
{ r.source => { case_sensitive: !r.casefold? } } [word_to_regexp(word, engine: engine), attrs]
end end
end end
# This regexp is run in miniracer, and the client JS app # This regexp is run in miniracer, and the client JS app
# Make sure it is compatible with major browsers when changing # Make sure it is compatible with major browsers when changing
# hint: non-chrome browsers do not support 'lookbehind' # hint: non-chrome browsers do not support 'lookbehind'
def self.word_matcher_regexp_list(action, engine: :ruby, raise_errors: false) def self.compiled_regexps_for_action(action, engine: :ruby, raise_errors: false)
words = get_cached_words(action) words = cached_words_for_action(action)
return [] if words.blank? return [] if words.blank?
grouped_words = { case_sensitive: [], case_insensitive: [] } words
.values
.group_by { |attrs| attrs[:case_sensitive] ? :case_sensitive : :case_insensitive }
.map do |group_key, attrs_list|
words = attrs_list.map { |attrs| attrs[:word] }
words.each do |word, attrs| # Compile all watched words into a single regular expression
word = word_to_regexp(word, whole: SiteSetting.watched_words_regular_expressions?) regexp =
group_key = attrs[:case_sensitive] ? :case_sensitive : :case_insensitive words
grouped_words[group_key] << word .map do |word|
end r = word_to_regexp(word, match_word: SiteSetting.watched_words_regular_expressions?)
begin
regexps = grouped_words.select { |_, w| w.present? }.transform_values { |w| w.join("|") } r if Regexp.new(r)
if !SiteSetting.watched_words_regular_expressions?
regexps.transform_values! { |regexp| wrap_regexp(regexp, engine: engine) }
end
regexps.map { |c, regexp| Regexp.new(regexp, c == :case_sensitive ? nil : Regexp::IGNORECASE) }
rescue RegexpError rescue RegexpError
raise if raise_errors raise if raise_errors
[] # Admin will be alerted via admin_dashboard_data.rb end
end
.select { |r| r.present? }
.join("|")
# Add word boundaries to the regexp for regular watched words
regexp =
match_word_regexp(
regexp,
engine: engine,
) if !SiteSetting.watched_words_regular_expressions?
# Add case insensitive flag if needed
Regexp.new(regexp, group_key == :case_sensitive ? nil : Regexp::IGNORECASE)
end
end end
def self.word_matcher_regexps(action, engine: :ruby) def self.serialized_regexps_for_action(action, engine: :ruby)
get_cached_words(action)&.to_h { |word, attrs| [word_to_regexp(word, engine: engine), attrs] } compiled_regexps_for_action(action, engine: engine).map do |r|
{ r.source => { case_sensitive: !r.casefold? } }
end
end end
def self.word_to_regexp(word, engine: :ruby, whole: true) def self.word_to_regexp(word, engine: :ruby, match_word: true)
if SiteSetting.watched_words_regular_expressions? if SiteSetting.watched_words_regular_expressions?
# Strip Ruby regexp format if present regexp = word
regexp = word.start_with?("(?-mix:") ? word[7..-2] : word regexp = "(#{regexp})" if match_word
regexp = "(#{regexp})" if whole regexp
return regexp else
end # Convert word to regex by escaping special characters in a regexp.
# Avoid using Regexp.escape because it escapes more characters than
# Escape regular expression. Avoid using Regexp.escape because it escapes # it should (for example, whitespaces, dashes, etc)
# more characters than it should (for example, whitespaces)
regexp = word.gsub(/([.*+?^${}()|\[\]\\])/, '\\\\\1') regexp = word.gsub(/([.*+?^${}()|\[\]\\])/, '\\\\\1')
# Handle wildcards # Convert wildcards to regexp
regexp = regexp.gsub("\\*", '\S*') regexp = regexp.gsub("\\*", '\S*')
regexp = wrap_regexp(regexp, engine: engine) if whole regexp = match_word_regexp(regexp, engine: engine) if match_word
regexp regexp
end end
def self.wrap_regexp(regexp, engine: :ruby)
if engine == :js
"(?:\\P{L}|^)(#{regexp})(?=\\P{L}|$)"
elsif engine == :ruby
"(?:[^[:word:]]|^)(#{regexp})(?=[^[:word:]]|$)"
else
"(?:\\W|^)(#{regexp})(?=\\W|$)"
end
end
def self.word_matcher_regexp_key(action)
"watched-words-list:v#{CACHE_VERSION}:#{action}"
end end
def self.censor(html) def self.censor(html)
regexps = word_matcher_regexp_list(:censor) regexps = compiled_regexps_for_action(:censor)
return html if regexps.blank? return html if regexps.blank?
doc = Nokogiri::HTML5.fragment(html) doc = Nokogiri::HTML5.fragment(html)
@ -133,7 +136,7 @@ class WordWatcher
def self.censor_text(text) def self.censor_text(text)
return text if text.blank? return text if text.blank?
regexps = word_matcher_regexp_list(:censor) regexps = compiled_regexps_for_action(:censor)
return text if regexps.blank? return text if regexps.blank?
regexps.inject(text) { |txt, regexp| censor_text_with_regexp(txt, regexp) } regexps.inject(text) { |txt, regexp| censor_text_with_regexp(txt, regexp) }
@ -156,10 +159,6 @@ class WordWatcher
text text
end end
def self.clear_cache!
WatchedWord.actions.each { |a, i| Discourse.cache.delete word_matcher_regexp_key(a) }
end
def requires_approval? def requires_approval?
word_matches_for_action?(:require_approval) word_matches_for_action?(:require_approval)
end end
@ -177,7 +176,7 @@ class WordWatcher
end end
def word_matches_for_action?(action, all_matches: false) def word_matches_for_action?(action, all_matches: false)
regexps = self.class.word_matcher_regexp_list(action) regexps = self.class.compiled_regexps_for_action(action)
return if regexps.blank? return if regexps.blank?
match_list = [] match_list = []
@ -254,14 +253,28 @@ class WordWatcher
private_class_method :censor_text_with_regexp private_class_method :censor_text_with_regexp
private # Returns a regexp that transforms a regular expression into a regular
# expression that matches a whole word.
def self.match_word_regexp(regexp, engine: :ruby)
if engine == :js
"(?:\\P{L}|^)(#{regexp})(?=\\P{L}|$)"
elsif engine == :ruby
"(?:[^[:word:]]|^)(#{regexp})(?=[^[:word:]]|$)"
else
raise "unknown regexp engine: #{engine}"
end
end
private_class_method :match_word_regexp
def self.replace(text, watch_word_type) def self.replace(text, watch_word_type)
word_matcher_regexps(watch_word_type) regexps_for_action(watch_word_type)
.to_a .to_a
.reduce(text) do |t, (word_regexp, attrs)| .reduce(text) do |t, (word_regexp, attrs)|
case_flag = attrs[:case_sensitive] ? nil : Regexp::IGNORECASE case_flag = attrs[:case_sensitive] ? nil : Regexp::IGNORECASE
replace_text_with_regexp(t, Regexp.new(word_regexp, case_flag), attrs[:replacement]) replace_text_with_regexp(t, Regexp.new(word_regexp, case_flag), attrs[:replacement])
end end
end end
private_class_method :replace
end end

View File

@ -202,8 +202,8 @@ class NewPostManager
def self.queue_enabled? def self.queue_enabled?
SiteSetting.approve_post_count > 0 || SiteSetting.approve_unless_trust_level.to_i > 0 || SiteSetting.approve_post_count > 0 || SiteSetting.approve_unless_trust_level.to_i > 0 ||
SiteSetting.approve_new_topics_unless_trust_level.to_i > 0 || SiteSetting.approve_new_topics_unless_trust_level.to_i > 0 ||
SiteSetting.approve_unless_staged || SiteSetting.approve_unless_staged || WordWatcher.words_for_action_exist?(:require_approval) ||
WordWatcher.words_for_action_exists?(:require_approval) || handlers.size > 1 handlers.size > 1
end end
def initialize(user, args) def initialize(user, args)

View File

@ -203,9 +203,9 @@ module PrettyText
__optInput.emojiUnicodeReplacer = __emojiUnicodeReplacer; __optInput.emojiUnicodeReplacer = __emojiUnicodeReplacer;
__optInput.emojiDenyList = #{Emoji.denied.to_json}; __optInput.emojiDenyList = #{Emoji.denied.to_json};
__optInput.lookupUploadUrls = __lookupUploadUrls; __optInput.lookupUploadUrls = __lookupUploadUrls;
__optInput.censoredRegexp = #{WordWatcher.serializable_word_matcher_regexp(:censor, engine: :js).to_json}; __optInput.censoredRegexp = #{WordWatcher.serialized_regexps_for_action(:censor, engine: :js).to_json};
__optInput.watchedWordsReplace = #{WordWatcher.word_matcher_regexps(:replace, engine: :js).to_json}; __optInput.watchedWordsReplace = #{WordWatcher.regexps_for_action(:replace, engine: :js).to_json};
__optInput.watchedWordsLink = #{WordWatcher.word_matcher_regexps(:link, engine: :js).to_json}; __optInput.watchedWordsLink = #{WordWatcher.regexps_for_action(:link, engine: :js).to_json};
__optInput.additionalOptions = #{Site.markdown_additional_options.to_json}; __optInput.additionalOptions = #{Site.markdown_additional_options.to_json};
__optInput.avatar_sizes = #{SiteSetting.avatar_sizes.to_json}; __optInput.avatar_sizes = #{SiteSetting.avatar_sizes.to_json};
JS JS

View File

@ -2,8 +2,8 @@
class CensoredWordsValidator < ActiveModel::EachValidator class CensoredWordsValidator < ActiveModel::EachValidator
def validate_each(record, attribute, value) def validate_each(record, attribute, value)
words_regexps = WordWatcher.word_matcher_regexp_list(:censor) words_regexps = WordWatcher.compiled_regexps_for_action(:censor)
if WordWatcher.words_for_action_exists?(:censor).present? && words_regexps.present? if WordWatcher.words_for_action_exist?(:censor).present? && words_regexps.present?
censored_words = censor_words(value, words_regexps) censored_words = censor_words(value, words_regexps)
return if censored_words.blank? return if censored_words.blank?

View File

@ -11,8 +11,8 @@ RSpec.describe CensoredWordsValidator do
Fabricate(:watched_word, action: WatchedWord.actions[:censor], word: "bad") Fabricate(:watched_word, action: WatchedWord.actions[:censor], word: "bad")
end end
context "when word_matcher_regexp_list is empty" do context "when compiled_regexps_for_action is empty" do
before { WordWatcher.stubs(:word_matcher_regexp_list).returns([]) } before { WordWatcher.stubs(:compiled_regexps_for_action).returns([]) }
it "adds no errors to the record" do it "adds no errors to the record" do
validate validate
@ -20,7 +20,7 @@ RSpec.describe CensoredWordsValidator do
end end
end end
context "when word_matcher_regexp_list is not empty" do context "when compiled_regexps_for_action is not empty" do
context "when the new value does not contain the watched word" do context "when the new value does not contain the watched word" do
let(:value) { "some new good text" } let(:value) { "some new good text" }

View File

@ -51,10 +51,7 @@ RSpec.describe Admin::WatchedWordsController do
), ),
) )
expect(watched_words["compiled_regular_expressions"]["block"].first).to eq( expect(watched_words["compiled_regular_expressions"]["block"].first).to eq(
WordWatcher WordWatcher.serialized_regexps_for_action(:block, engine: :js).first.deep_stringify_keys,
.serializable_word_matcher_regexp(:block, engine: :js)
.first
.deep_stringify_keys,
) )
end end
end end

View File

@ -52,7 +52,7 @@ RSpec.describe WordWatcher do
end end
end end
describe ".word_matcher_regexp_list" do describe ".compiled_regexps_for_action" do
let!(:word1) { Fabricate(:watched_word, action: WatchedWord.actions[:block]).word } let!(:word1) { Fabricate(:watched_word, action: WatchedWord.actions[:block]).word }
let!(:word2) { Fabricate(:watched_word, action: WatchedWord.actions[:block]).word } let!(:word2) { Fabricate(:watched_word, action: WatchedWord.actions[:block]).word }
let!(:word3) do let!(:word3) do
@ -65,7 +65,7 @@ RSpec.describe WordWatcher do
context "when watched_words_regular_expressions = true" do context "when watched_words_regular_expressions = true" do
it "returns the proper regexp" do it "returns the proper regexp" do
SiteSetting.watched_words_regular_expressions = true SiteSetting.watched_words_regular_expressions = true
regexps = described_class.word_matcher_regexp_list(:block) regexps = described_class.compiled_regexps_for_action(:block)
expect(regexps).to be_an(Array) expect(regexps).to be_an(Array)
expect(regexps.map(&:inspect)).to contain_exactly( expect(regexps.map(&:inspect)).to contain_exactly(
@ -78,7 +78,7 @@ RSpec.describe WordWatcher do
context "when watched_words_regular_expressions = false" do context "when watched_words_regular_expressions = false" do
it "returns the proper regexp" do it "returns the proper regexp" do
SiteSetting.watched_words_regular_expressions = false SiteSetting.watched_words_regular_expressions = false
regexps = described_class.word_matcher_regexp_list(:block) regexps = described_class.compiled_regexps_for_action(:block)
expect(regexps).to be_an(Array) expect(regexps).to be_an(Array)
expect(regexps.map(&:inspect)).to contain_exactly( expect(regexps.map(&:inspect)).to contain_exactly(
@ -88,7 +88,7 @@ RSpec.describe WordWatcher do
end end
it "is empty for an action without watched words" do it "is empty for an action without watched words" do
regexps = described_class.word_matcher_regexp_list(:censor) regexps = described_class.compiled_regexps_for_action(:censor)
expect(regexps).to be_an(Array) expect(regexps).to be_an(Array)
expect(regexps).to be_empty expect(regexps).to be_empty
@ -102,12 +102,16 @@ RSpec.describe WordWatcher do
end end
it "does not raise an exception by default" do it "does not raise an exception by default" do
expect { described_class.word_matcher_regexp_list(:block) }.not_to raise_error expect { described_class.compiled_regexps_for_action(:block) }.not_to raise_error
expect(described_class.compiled_regexps_for_action(:block)).to contain_exactly(
/(#{word1})|(#{word2})/i,
/(#{word3})|(#{word4})/,
)
end end
it "raises an exception with raise_errors set to true" do it "raises an exception with raise_errors set to true" do
expect { expect {
described_class.word_matcher_regexp_list(:block, raise_errors: true) described_class.compiled_regexps_for_action(:block, raise_errors: true)
}.to raise_error(RegexpError) }.to raise_error(RegexpError)
end end
end end