discourse-ai/app/jobs/scheduled/summaries_backfill.rb
Roman Rizzi 1b1b44353b
FEATURE: Changes to summaries' outdated logic. (#1108)
Before this change, a summary was only considered outdated when new content appeared or, for topics with "best replies", when the query returned different results. The intent behind this change is to also detect when a summary becomes outdated as a result of an edit.

Additionally, we are changing the backfill candidates query to compare "ai_summary_backfill_topic_max_age_days" against "last_posted_at" instead of "created_at", to catch long-lived, active topics. This was discussed here: https://meta.discourse.org/t/ai-summarization-backfill-is-stuck-keeps-regenerating-the-same-topic/347088/14?u=roman_rizzi
2025-02-04 09:31:11 -03:00

87 lines
3.1 KiB
Ruby

# frozen_string_literal: true
module ::Jobs
class SummariesBackfill < ::Jobs::Scheduled
every 5.minutes
cluster_concurrency 1
# Entry point of the scheduled job: summarizes a budget-limited batch of
# backfill candidates, first gists (when enabled) and then complete summaries.
#
# _args - job arguments; unused.
#
# Does nothing when the plugin or summarization is disabled, or when the hourly
# backfill budget setting is zero.
def execute(_args)
  return unless SiteSetting.discourse_ai_enabled && SiteSetting.ai_summarization_enabled
  return if SiteSetting.ai_summary_backfill_maximum_topics_per_hour.zero?

  # Every backfilled summary is generated on behalf of the system user.
  summarizer = Discourse.system_user

  if SiteSetting.ai_summary_gists_enabled
    gist_type = AiSummary.summary_types[:gist]
    gist_topics = backfill_candidates(gist_type).limit(current_budget(gist_type))

    gist_topics.each do |topic|
      try_summarize(DiscourseAi::Summarization.topic_gist(topic), summarizer, topic)
    end
  end

  # The budget is computed per summary type, so complete summaries get their
  # own allowance independent of the gists handled above.
  complete_type = AiSummary.summary_types[:complete]
  complete_topics = backfill_candidates(complete_type).limit(current_budget(complete_type))

  complete_topics.each do |topic|
    try_summarize(DiscourseAi::Summarization.topic_summary(topic), summarizer, topic)
  end
end
# Regenerates the summary for a topic when needed; otherwise just records that
# the stored summary is still current.
#
# strategy - summarization strategy; must respond to #existing_summary and #summarize.
# user     - user the summary is generated on behalf of.
# topic    - topic being summarized; only #highest_post_number is read here.
#
# Returns whatever strategy.summarize or existing_summary.update! returns.
def try_summarize(strategy, user, topic)
  stored = strategy.existing_summary
  needs_regeneration = stored.blank? || stored.outdated

  return strategy.summarize(user) if needs_regeneration

  # Topic#highest_post_number also moves when a post is hidden or deleted, or
  # when a small action is created. That number is used as a cheap filter for
  # picking backfill candidates without relying on original_content_sha.
  # Reaching this point means the summary is not outdated, so something other
  # than a regular reply bumped the number: sync it so the topic is no longer
  # selected as a candidate.
  stored.update!(highest_target_number: topic.highest_post_number)
end
# Builds the query that selects topics needing a summary of the given type.
#
# summary_type - summary type value, interpolated into the join condition.
#
# A topic qualifies when it meets the minimum word count, had a post within the
# configured max age (in days), and either has no summary of this type or has
# one that is behind the topic's highest post number and was last updated more
# than five minutes ago. Topics without a summary sort first.
#
# Returns the chained Topic query; callers apply their own limit.
def backfill_candidates(summary_type)
  max_age_days = SiteSetting.ai_summary_backfill_topic_max_age_days
  minimum_words = SiteSetting.ai_summary_backfill_minimum_word_count

  # LEFT OUTER JOIN keeps topics that have no summary row yet (ais.id IS NULL).
  summaries_join = <<~SQL
    LEFT OUTER JOIN ai_summaries ais ON
    topics.id = ais.target_id AND
    ais.target_type = 'Topic' AND
    ais.summary_type = '#{summary_type}'
  SQL

  # to_i guarantees only an integer reaches the interpolated SQL.
  recently_active =
    "topics.last_posted_at > current_timestamp - INTERVAL '#{max_age_days.to_i} DAY'"

  # NOTE(review): an earlier inline comment here said "(1..1) gets stored as
  # (1..2)"; nothing in this query uses a range, so it is presumably stale.
  missing_or_stale = <<~SQL
    ais.id IS NULL OR (
    ais.highest_target_number < topics.highest_post_number
    AND ais.updated_at < (current_timestamp - INTERVAL '5 minutes')
    )
  SQL

  Topic
    .where("topics.word_count >= ?", minimum_words)
    .joins(summaries_join)
    .where(recently_active)
    .where(missing_or_stale)
    .order("ais.updated_at DESC NULLS FIRST, topics.last_posted_at DESC")
end
# Number of topics this run may summarize for the given summary type.
#
# type - summary type used to filter the summaries counted against the budget.
#
# The hourly setting is split into 12 slices, with a minimum of one topic per
# slice, and capped by what is left of the hourly budget after subtracting the
# system summaries of this type created within the last hour.
#
# Returns a non-negative Integer.
def current_budget(type)
  hourly_cap = SiteSetting.ai_summary_backfill_maximum_topics_per_hour
  # Integer division; flooring the numerator at 12 keeps each slice at >= 1.
  per_run_cap = [hourly_cap, 12].max / 12

  spent = AiSummary.system.where("created_at > ?", 1.hour.ago).where(summary_type: type).count
  remaining = hourly_cap - spent

  # Overspending makes `remaining` negative; never report less than zero.
  [[remaining, per_run_cap].min, 0].max
end
end
end