FEATURE: link correctly to filters to assist in debugging spam (#1031)

- Add spam_score_type to AiSpamSerializer for better integration with reviewables.
- Introduce a custom filter for detecting AI spam false negatives in moderation workflows.
- Refactor spam report generation to improve identification of false negatives.
- Add tests to verify the custom filter and its behavior.
- Introduce links for all spam counts in the report.
This commit is contained in:
Sam 2024-12-17 11:02:18 +11:00 committed by GitHub
parent 90ce942108
commit fae2d5ff2c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 118 additions and 28 deletions

View File

@ -1,7 +1,13 @@
# frozen_string_literal: true # frozen_string_literal: true
class AiSpamSerializer < ApplicationSerializer class AiSpamSerializer < ApplicationSerializer
attributes :is_enabled, :llm_id, :custom_instructions, :available_llms, :stats, :flagging_username attributes :is_enabled,
:llm_id,
:custom_instructions,
:available_llms,
:stats,
:flagging_username,
:spam_score_type
def is_enabled def is_enabled
object[:enabled] object[:enabled]
@ -25,6 +31,10 @@ class AiSpamSerializer < ApplicationSerializer
object[:flagging_username] object[:flagging_username]
end end
def spam_score_type
ReviewableScore.types[:spam]
end
def stats def stats
{ {
scanned_count: object[:stats].scanned_count.to_i, scanned_count: object[:stats].scanned_count.to_i,

View File

@ -125,9 +125,30 @@ export default class AiSpam extends Component {
label: i18n("discourse_ai.spam.spam_detected"), label: i18n("discourse_ai.spam.spam_detected"),
value: this.stats.spam_detected, value: this.stats.spam_detected,
}; };
const falsePositives = {
label: i18n("discourse_ai.spam.false_positives"),
value: this.stats.false_positives,
tooltip: i18n("discourse_ai.spam.stat_tooltips.incorrectly_flagged"),
};
const falseNegatives = {
label: i18n("discourse_ai.spam.false_negatives"),
value: this.stats.false_negatives,
tooltip: i18n("discourse_ai.spam.stat_tooltips.missed_spam"),
};
if (this.args.model.flagging_username) { if (this.args.model.flagging_username) {
detected.href = getURL( detected.href = getURL(
"/review?flagged_by=" + this.args.model.flagging_username `/review?flagged_by=${this.args.model.flagging_username}&status=all&sort_order=created_at`
);
falsePositives.href = getURL(
`/review?flagged_by=${this.args.model.flagging_username}&status=rejected&sort_order=created_at`
);
falseNegatives.href = getURL(
`/review?status=approved&sort_order=created_at&additional_filters={"ai_spam_false_negative":true}&order=created&score_type=${this.args.model.spam_score_type}`
); );
} }
return [ return [
@ -136,16 +157,8 @@ export default class AiSpam extends Component {
value: this.stats.scanned_count, value: this.stats.scanned_count,
}, },
detected, detected,
{ falsePositives,
label: i18n("discourse_ai.spam.false_positives"), falseNegatives,
value: this.stats.false_positives,
tooltip: i18n("discourse_ai.spam.stat_tooltips.incorrectly_flagged"),
},
{
label: i18n("discourse_ai.spam.false_negatives"),
value: this.stats.false_negatives,
tooltip: i18n("discourse_ai.spam.stat_tooltips.missed_spam"),
},
]; ];
} }

View File

@ -11,6 +11,25 @@ module DiscourseAi
plugin.on(:site_setting_changed) do |name, _old_value, new_value| plugin.on(:site_setting_changed) do |name, _old_value, new_value|
SpamScanner.ensure_flagging_user! if name == :ai_spam_detection_enabled && new_value SpamScanner.ensure_flagging_user! if name == :ai_spam_detection_enabled && new_value
end end
# Register a custom review-queue filter, reachable via
# /review?additional_filters={"ai_spam_false_negative":true}.
# It narrows the reviewable list to items whose target Post has an
# ai_spam_logs row where the scanner said "not spam" (NOT is_spam) —
# i.e. posts the AI scanner cleared. Combined with the queue's own
# status filters (e.g. status=approved), this surfaces the scanner's
# false negatives for debugging.
custom_filter = [
:ai_spam_false_negative,
Proc.new do |results, value|
if value
# Correlated EXISTS against the outer reviewables row
# (target_id / target_type refer to the reviewable being filtered).
results.where(<<~SQL)
EXISTS (
SELECT 1 FROM ai_spam_logs
WHERE NOT is_spam
AND post_id = target_id AND target_type = 'Post'
)
SQL
else
# Filter not requested (falsey value): pass the relation through unchanged.
results
end
end,
]
Reviewable.add_custom_filter(custom_filter)
end end
end end
end end

View File

@ -14,33 +14,34 @@ module DiscourseAi
asl.post_id, asl.post_id,
asl.is_spam, asl.is_spam,
r.status as reviewable_status, r.status as reviewable_status,
r.target_type, CASE WHEN EXISTS (
r.potential_spam SELECT 1 FROM reviewable_scores rs
JOIN reviewables r1 ON r1.id = rs.reviewable_id
WHERE r1.target_id = asl.post_id
AND r1.target_type = 'Post'
AND rs.reviewable_score_type = :spam_score_type
AND NOT is_spam
AND r1.status IN (:spam)
) THEN true ELSE false END AS missed_spam
FROM ai_spam_logs asl FROM ai_spam_logs asl
LEFT JOIN reviewables r ON r.id = asl.reviewable_id LEFT JOIN reviewables r ON r.id = asl.reviewable_id
WHERE asl.created_at > :min_date WHERE asl.created_at > :min_date
),
post_reviewables AS (
SELECT
target_id post_id,
COUNT(DISTINCT target_id) as false_negative_count
FROM reviewables
WHERE target_type = 'Post'
AND status IN (:spam)
AND potential_spam
AND target_id IN (SELECT post_id FROM spam_stats)
GROUP BY target_id
) )
SELECT SELECT
COUNT(*) AS scanned_count, COUNT(*) AS scanned_count,
SUM(CASE WHEN is_spam THEN 1 ELSE 0 END) AS spam_detected, SUM(CASE WHEN is_spam THEN 1 ELSE 0 END) AS spam_detected,
COUNT(CASE WHEN reviewable_status IN (:ham) THEN 1 END) AS false_positives, COUNT(CASE WHEN reviewable_status IN (:ham) THEN 1 END) AS false_positives,
COALESCE(SUM(pr.false_negative_count), 0) AS false_negatives COUNT(CASE WHEN missed_spam THEN 1 END) AS false_negatives
FROM spam_stats FROM spam_stats
LEFT JOIN post_reviewables pr USING (post_id)
SQL SQL
DB.query(sql, spam: spam_status, ham: ham_status, min_date: min_date).first DB.query(
sql,
spam: spam_status,
ham: ham_status,
min_date: min_date,
spam_score_type: ReviewableScore.types[:spam],
).first
end end
end end
end end

View File

@ -0,0 +1,47 @@
# frozen_string_literal: true
# Request spec for the :ai_spam_false_negative custom reviewable filter
# registered by this plugin.
RSpec.describe ReviewablesController do
fab!(:post1) { Fabricate(:post) }
fab!(:post2) { Fabricate(:post) }
fab!(:admin)
fab!(:llm_model)
# Two pending reviewables — one per post — so the unfiltered queue has 2 items.
fab!(:reviewable) do
Reviewable.create!(
target: post1,
topic: post2.topic,
type: ReviewablePost,
created_by: admin,
status: Reviewable.statuses[:pending],
)
end
fab!(:reviewable2) do
Reviewable.create!(
target: post2,
topic: post2.topic,
type: ReviewablePost,
created_by: admin,
status: Reviewable.statuses[:pending],
)
end
# Scan log saying the AI judged post1 NOT spam (is_spam: false); only post1's
# reviewable should therefore match the false-negative filter.
fab!(:ai_spam_log_missed) do
AiSpamLog.create!(is_spam: false, post_id: post1.id, llm_model_id: llm_model.id)
end
# we amend the behavior with a custom filter so we need to confirm it works
it "properly applies custom filter" do
sign_in(admin)
# With the filter: only post1 (the one with a "not spam" scan log) is returned.
get '/review.json?additional_filters={"ai_spam_false_negative":true}'
expect(response.status).to eq(200)
json = JSON.parse(response.body)
expect(json["reviewables"].length).to eq(1)
# Without the filter: both pending reviewables are returned,
# confirming the filter is opt-in and does not leak into default queries.
get "/review.json"
expect(response.status).to eq(200)
json = JSON.parse(response.body)
expect(json["reviewables"].length).to eq(2)
end
end