discourse-ai/lib/ai_moderation/spam_scanner.rb
Sam 32dc45ba4f
FIX: never block spam scanning user (#1437)
Previously, staff and bots would get scanned if their TL was low.
Additionally, if the spam scanner user was somehow blocked
(deactivated, silenced, banned), the feature would stop working.

This adds an override that unconditionally ensures the user is set up correctly prior to scanning
2025-06-17 14:51:27 +10:00

# frozen_string_literal: true
module DiscourseAi
module AiModeration
class SpamScanner
POSTS_TO_SCAN = 3
MINIMUM_EDIT_DIFFERENCE = 10
EDIT_DELAY_MINUTES = 10
MAX_AGE_TO_SCAN = 1.day
MAX_RAW_SCAN_LENGTH = 5000
SHOULD_SCAN_POST_CUSTOM_FIELD = "discourse_ai_should_scan_post"
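
# Entry point when a post is created: mark eligible posts so they are scanned once cooked.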
def self.new_post(post)
return if !enabled?
return if !should_scan_post?(post)
flag_post_for_scanning(post)
end
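
# Creates the dedicated spam-scanning bot user (negative id, admin, TL4) on first use and
# stores its id in the ai_spam_detection_user_id site setting.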
def self.ensure_flagging_user!
if !SiteSetting.ai_spam_detection_user_id.present?
User.transaction do
# prefer a "high" id for this bot
id = User.where("id > -20").minimum(:id) - 1
id = User.minimum(:id) - 1 if id == -100
user =
User.create!(
id: id,
username: UserNameSuggester.suggest("discourse_ai_spam"),
name: "Discourse AI Spam Scanner",
email: "#{SecureRandom.hex(10)}@invalid.invalid",
active: true,
approved: true,
trust_level: TrustLevel[4],
admin: true,
)
Group.user_trust_level_change!(user.id, user.trust_level)
SiteSetting.ai_spam_detection_user_id = user.id
end
end
end
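
# The user that flags and silences spammers; repaired if blocked, with the system user as a fallback.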
def self.flagging_user
user = nil
if SiteSetting.ai_spam_detection_user_id.present?
user = User.find_by(id: SiteSetting.ai_spam_detection_user_id)
ensure_safe_flagging_user!(user)
end
user || Discourse.system_user
end
def self.ensure_safe_flagging_user!(user)
# only repair bot users; if this is somehow set to a human account, skip repairs
return if !user.bot?
user.update!(silenced_till: nil) if user.silenced?
user.update!(trust_level: TrustLevel[4]) if user.trust_level != TrustLevel[4]
user.update!(suspended_till: nil, suspended_at: nil) if user.suspended?
user.update!(active: true) if !user.active?
end
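
# Once the post is cooked, enqueue the scan job, delaying rescans of posts scanned within
# the last EDIT_DELAY_MINUTES minutes.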
def self.after_cooked_post(post)
return if !enabled?
return if !should_scan_post?(post)
return if !post.custom_fields[SHOULD_SCAN_POST_CUSTOM_FIELD]
return if post.updated_at < MAX_AGE_TO_SCAN.ago
last_scan = AiSpamLog.where(post_id: post.id).order(created_at: :desc).first
if last_scan && last_scan.created_at > EDIT_DELAY_MINUTES.minutes.ago
delay_minutes =
((last_scan.created_at + EDIT_DELAY_MINUTES.minutes) - Time.current).to_i / 60
Jobs.enqueue_in(delay_minutes.minutes, :ai_spam_scan, post_id: post.id)
else
Jobs.enqueue(:ai_spam_scan, post_id: post.id)
end
end
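
# Re-flag an edited post only when the edit is significant and the post has not hit the scan limit.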
def self.edited_post(post)
return if !enabled?
return if !should_scan_post?(post)
return if scanned_max_times?(post)
previous_version = post.revisions.last&.modifications&.dig("raw", 0)
current_version = post.raw
return if !significant_change?(previous_version, current_version)
flag_post_for_scanning(post)
end
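
# Scanning happens asynchronously after cooking; this custom field marks a post as pending a scan.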
def self.flag_post_for_scanning(post)
post.custom_fields[SHOULD_SCAN_POST_CUSTOM_FIELD] = "true"
post.save_custom_fields
end
def self.enabled?
SiteSetting.ai_spam_detection_enabled && SiteSetting.discourse_ai_enabled
end
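
# Scan only low trust level, non-staff, non-bot authors with few public posts, and never private messages.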
def self.should_scan_post?(post)
return false if !post.present?
return false if post.user.trust_level > TrustLevel[1]
return false if post.topic.private_message?
return false if post.user.bot?
return false if post.user.staff?
if Post
.where(user_id: post.user_id)
.joins(:topic)
.where(topic: { archetype: Archetype.default })
.limit(POSTS_TO_SCAN + 1)
.count > POSTS_TO_SCAN
return false
end
true
end
def self.scanned_max_times?(post)
AiSpamLog.where(post_id: post.id).count >= 3
end
def self.significant_change?(previous_version, current_version)
return true if previous_version.nil? # First edit should be scanned
# Use Discourse's built-in levenshtein implementation
distance =
ScreenedEmail.levenshtein(previous_version.to_s[0...1000], current_version.to_s[0...1000])
distance >= MINIMUM_EDIT_DIFFERENCE
end
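
# Dry run against a single post: returns the verdict plus a readable log containing scan history,
# the prompt, the context and a follow-up asking the LLM to explain its reasoning.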
def self.test_post(post, custom_instructions: nil, llm_id: nil)
settings = AiModerationSetting.spam
llm_model = llm_id ? LlmModel.find(llm_id) : settings.llm_model
llm = llm_model.to_llm
custom_instructions = custom_instructions || settings.custom_instructions.presence
context = build_context(post, post.topic || Topic.with_deleted.find_by(id: post.topic_id))
prompt = completion_prompt(post, context: context, custom_instructions: custom_instructions)
result =
llm.generate(
prompt,
temperature: 0.1,
max_tokens: 5,
user: Discourse.system_user,
feature_name: "spam_detection_test",
feature_context: {
post_id: post.id,
},
)&.strip
history = nil
AiSpamLog
.where(post: post)
.order(:created_at)
.limit(100)
.each do |log|
history ||= +"Scan History:\n"
history << "date: #{log.created_at} is_spam: #{log.is_spam}\n"
end
log = +"Scanning #{post.url}\n\n"
if history
log << history
log << "\n"
end
log << "LLM: #{llm_model.name}\n\n"
log << "System Prompt: #{build_system_prompt(custom_instructions)}\n\n"
log << "Context: #{context}\n\n"
is_spam = check_if_spam(result)
prompt.push(type: :model, content: result)
prompt.push(type: :user, content: "Explain your reasoning")
reasoning =
llm.generate(
prompt,
temperature: 0.1,
max_tokens: 100,
user: Discourse.system_user,
feature_name: "spam_detection_test",
feature_context: {
post_id: post.id,
},
)&.strip
log << "#{reasoning}"
{ is_spam: is_spam, log: log }
end
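
# Builds the LLM prompt, attaching up to three of the post's uploads alongside the text context.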
def self.completion_prompt(post, context:, custom_instructions:)
system_prompt = build_system_prompt(custom_instructions)
prompt = DiscourseAi::Completions::Prompt.new(system_prompt)
args = { type: :user, content: context }
upload_ids = post.upload_ids
if upload_ids.present?
args[:content] = [args[:content]]
upload_ids.take(3).each { |upload_id| args[:content] << { upload_id: upload_id } }
end
prompt.push(**args)
prompt
end
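
# Job entry points: perform_scan re-checks eligibility, perform_scan! calls the LLM, records an
# AiSpamLog row and handles spam inside a single transaction.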
def self.perform_scan(post)
return if !should_scan_post?(post)
perform_scan!(post)
end
def self.perform_scan!(post)
return if !enabled?
settings = AiModerationSetting.spam
return if !settings || !settings.llm_model
context = build_context(post)
llm = settings.llm_model.to_llm
custom_instructions = settings.custom_instructions.presence
prompt = completion_prompt(post, context: context, custom_instructions: custom_instructions)
begin
result =
llm.generate(
prompt,
temperature: 0.1,
max_tokens: 5,
user: Discourse.system_user,
feature_name: "spam_detection",
feature_context: {
post_id: post.id,
},
)&.strip
is_spam = check_if_spam(result)
log = AiApiAuditLog.order(id: :desc).where(feature_name: "spam_detection").first
AiSpamLog.transaction do
log =
AiSpamLog.create!(
post: post,
llm_model: settings.llm_model,
ai_api_audit_log: log,
is_spam: is_spam,
payload: context,
)
handle_spam(post, log) if is_spam
end
rescue StandardError => e
# log and re-raise so the job can retry; otherwise failed scans would never be handled
Discourse.warn_exception(
e,
message: "Discourse AI: Error in SpamScanner for post #{post.id}",
)
raise e
end
end
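
# Repair helper for when the flagging user has lost its admin flag.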
def self.fix_spam_scanner_not_admin
user = DiscourseAi::AiModeration::SpamScanner.flagging_user
if user.present?
user.update!(admin: true)
else
raise Discourse::NotFound
end
end
private
def self.check_if_spam(result)
(result.present? && result.strip.downcase.start_with?("spam"))
end
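
# Assembles the context handed to the LLM: topic/reply framing, author details and the truncated raw post.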
def self.build_context(post, topic = nil)
topic ||= post.topic
context = []
# Clear distinction between reply and new topic
if post.is_first_post?
context << "NEW TOPIC POST ANALYSIS"
context << "- Topic title: #{topic.title}"
context << "- Category: #{topic.category&.name}"
else
context << "REPLY POST ANALYSIS"
context << "- In topic: #{topic.title}"
context << "- Category: #{topic.category&.name}"
context << "- Topic started by: #{topic.user&.username}"
if post.reply_to_post_number.present?
parent =
Post.with_deleted.find_by(topic_id: topic.id, post_number: post.reply_to_post_number)
if parent
context << "\nReplying to #{parent.user&.username}'s post:"
context << "#{parent.raw[0..500]}..." if parent.raw.length > 500
context << parent.raw if parent.raw.length <= 500
end
end
end
context << "\nPost Author Information:"
if user = post.user # during test we may not have a user
context << "- Username: #{user.username}\n"
context << "- Email: #{user.email}\n"
context << "- Account age: #{(Time.current - user.created_at).to_i / 86_400} days\n"
context << "- Total posts: #{user.post_count}\n"
context << "- Trust level: #{user.trust_level}\n"
if info = location_info(user)
context << "- Registration Location: #{info[:registration]}\n" if info[:registration]
context << "- Last Location: #{info[:last]}\n" if info[:last]
end
end
context << "\nPost Content (first #{MAX_RAW_SCAN_LENGTH} chars):\n"
context << post.raw[0..MAX_RAW_SCAN_LENGTH]
context.join("\n")
end
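
# Best-effort lookup of the author's last-seen and registration IP locations; returns nil when
# nothing useful is found.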
def self.location_info(user)
registration, last = nil
if user.ip_address.present?
info = DiscourseIpInfo.get(user.ip_address, resolve_hostname: true)
last = "#{info[:location]} (#{info[:organization]})" if info && info[:location].present?
end
if user.registration_ip_address.present?
info = DiscourseIpInfo.get(user.registration_ip_address, resolve_hostname: true)
registration = "#{info[:location]} (#{info[:organization]})" if info &&
info[:location].present?
end
rval = nil
if registration || last
rval = { registration: registration } if registration
if last && last != registration
rval ||= {}
rval[:last] = last
end
end
rval
rescue => e
Discourse.warn_exception(e, message: "Failed to lookup location info")
nil
end
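
# Base spam-detection instructions plus site-specific details and any staff-provided custom instructions.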
def self.build_system_prompt(custom_instructions)
base_prompt = +<<~PROMPT
You are a spam detection system. Analyze the following post content and context.
Respond with "SPAM" if the post is spam, or "NOT_SPAM" if it's legitimate.
- ALWAYS lead your reply with the word SPAM or NOT_SPAM - you are consumed via an API
Consider the post type carefully:
- For REPLY posts: Check if the response is relevant and topical to the thread
- For NEW TOPIC posts: Check if it's a legitimate topic or spam promotion
A post is spam if it matches any of these criteria:
- Contains unsolicited commercial content or promotions
- Has suspicious or unrelated external links
- Shows patterns of automated/bot posting
- Contains irrelevant content or advertisements
- For replies: Completely unrelated to the discussion thread
- Uses excessive keywords or repetitive text patterns
- Shows suspicious formatting or character usage
Be especially strict with:
- Replies that ignore the previous conversation
- Posts containing multiple unrelated external links
- Generic responses that could be posted anywhere
Be fair to:
- New users making legitimate first contributions
- Non-native speakers making genuine efforts to participate
- Topic-relevant product mentions in appropriate contexts
PROMPT
base_prompt << "\n\n"
base_prompt << <<~SITE_SPECIFIC
Site Specific Information:
- Site name: #{SiteSetting.title}
- Site URL: #{Discourse.base_url}
- Site description: #{SiteSetting.site_description}
- Site top 10 categories: #{Category.where(read_restricted: false).order(posts_year: :desc).limit(10).pluck(:name).join(", ")}
SITE_SPECIFIC
if custom_instructions.present?
base_prompt << "\n\nAdditional site-specific instructions provided by Staff:\n#{custom_instructions}"
end
base_prompt
end
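
# Flag the post as spam, silence the author (keeping their posts) and hide the post; if core
# refuses the flag (e.g. the post is already flagged as spam) record the error instead.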
def self.handle_spam(post, log)
url = "#{Discourse.base_url}/admin/plugins/discourse-ai/ai-spam"
reason = I18n.t("discourse_ai.spam_detection.flag_reason", url: url)
flagging_user = self.flagging_user
result =
PostActionCreator.new(
flagging_user,
post,
PostActionType.types[:spam],
reason: reason,
queue_for_review: true,
).perform
# Currently in core re-flagging something that is already flagged as spam
# is not supported, long term we may want to support this but in the meantime
# we should not be silencing/hiding if the PostActionCreator fails.
if result.success?
log.update!(reviewable: result.reviewable)
reason = I18n.t("discourse_ai.spam_detection.silence_reason", url: url)
silencer =
UserSilencer.new(
post.user,
flagging_user,
message: :too_many_spam_flags,
post_id: post.id,
reason: reason,
keep_posts: true,
)
silencer.silence
# silencer will not hide tl1 posts, so we do this here
hide_post(post)
else
log.update!(
error:
"unable to flag post as spam, post action failed for post #{post.id} with error: '#{result.errors.full_messages.join(", ").truncate(3000)}'",
)
end
end
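
# Hide the post (preserving any existing hidden reason) and unlist the topic when its first post is hidden.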
def self.hide_post(post)
Post.where(id: post.id).update_all(
[
"hidden = true, hidden_reason_id = COALESCE(hidden_reason_id, ?)",
Post.hidden_reasons[:new_user_spam_threshold_reached],
],
)
Topic.where(id: post.topic_id).update_all(visible: false) if post.post_number == 1
end
end
end
end