FEATURE: Changes to summaries' outdated logic. (#1108)

Before this change, a summary was only considered outdated when new content appeared or, for topics with "best replies", when the query returned different results. The intent behind this change is to also detect when a summary is outdated as a result of an edit.

Additionally, we are changing the backfill candidates query to compare "ai_summary_backfill_topic_max_age_days" against "last_posted_at" instead of "created_at", to catch long-lived, active topics. This was discussed here: https://meta.discourse.org/t/ai-summarization-backfill-is-stuck-keeps-regenerating-the-same-topic/347088/14?u=roman_rizzi
This commit is contained in:
Roman Rizzi 2025-02-04 09:31:11 -03:00 committed by GitHub
parent d3b93f984d
commit 1b1b44353b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 59 additions and 21 deletions

View File

@ -57,7 +57,7 @@ module ::Jobs
ais.target_type = 'Topic' AND
ais.summary_type = '#{summary_type}'
SQL
.where("topics.created_at > current_timestamp - INTERVAL '#{max_age_days.to_i} DAY'")
.where("topics.last_posted_at > current_timestamp - INTERVAL '#{max_age_days.to_i} DAY'")
.where(
<<~SQL, # (1..1) gets stored as (1..2).
ais.id IS NULL OR (

View File

@ -58,8 +58,9 @@ module DiscourseAi
if summary
@existing_summary = summary
if existing_summary.original_content_sha != latest_sha
@existing_summary.mark_as_outdated
if summary.original_content_sha != latest_sha ||
content_to_summarize.any? { |cts| cts[:last_version_at] > summary.updated_at }
summary.mark_as_outdated
end
end
end

View File

@ -23,8 +23,8 @@ module DiscourseAi
.where("chat_messages.created_at > ?", since.hours.ago)
.includes(:user)
.order(created_at: :asc)
.pluck(:id, :username_lower, :message)
.map { { id: _1, poster: _2, text: _3 } }
.pluck(:id, :username_lower, :message, :updated_at)
.map { { id: _1, poster: _2, text: _3, last_version_at: _4 } }
end
def summary_extension_prompt(summary, contents)

View File

@ -49,16 +49,16 @@ module DiscourseAi
.joins(:user)
.where("post_number IN (?)", recent_hot_posts << op_post_number)
.order(:post_number)
.pluck(:post_number, :raw, :username)
.pluck(:post_number, :raw, :username, :last_version_at)
posts_data.reduce([]) do |memo, (pn, raw, username)|
posts_data.reduce([]) do |memo, (pn, raw, username, last_version_at)|
raw_text = raw
if pn == 1 && target.topic_embed&.embed_content_cache.present?
raw_text = target.topic_embed&.embed_content_cache
end
memo << { poster: username, id: pn, text: raw_text }
memo << { poster: username, id: pn, text: raw_text, last_version_at: last_version_at }
end
end

View File

@ -18,16 +18,17 @@ module DiscourseAi
:post_number,
:raw,
:username,
:last_version_at,
)
posts_data.reduce([]) do |memo, (pn, raw, username)|
posts_data.reduce([]) do |memo, (pn, raw, username, last_version_at)|
raw_text = raw
if pn == 1 && target.topic_embed&.embed_content_cache.present?
raw_text = target.topic_embed&.embed_content_cache
end
memo << { poster: username, id: pn, text: raw_text }
memo << { poster: username, id: pn, text: raw_text, last_version_at: last_version_at }
end
end

View File

@ -1,7 +1,9 @@
# frozen_string_literal: true
RSpec.describe Jobs::SummariesBackfill do
fab!(:topic) { Fabricate(:topic, word_count: 200, highest_post_number: 2) }
fab!(:topic) do
Fabricate(:topic, word_count: 200, highest_post_number: 2, last_posted_at: 2.hours.ago)
end
let(:limit) { 24 } # guarantee two summaries per batch
let(:intervals) { 12 } # budget is split into intervals. Job runs every five minutes.
@ -73,7 +75,7 @@ RSpec.describe Jobs::SummariesBackfill do
it "respects max age setting" do
SiteSetting.ai_summary_backfill_topic_max_age_days = 1
topic.update!(created_at: 2.days.ago)
topic.update!(last_posted_at: 2.days.ago)
expect(subject.backfill_candidates(type)).to be_empty
end
@ -112,14 +114,14 @@ RSpec.describe Jobs::SummariesBackfill do
end
it "updates the highest_target_number if the summary turned to be up to date" do
og_highest_post_number = topic.highest_post_number
existing_summary =
Fabricate(
:ai_summary,
target: topic,
updated_at: 3.hours.ago,
highest_target_number: topic.highest_post_number,
highest_target_number: og_highest_post_number,
)
og_highest_post_number = topic.highest_post_number
topic.update!(highest_post_number: og_highest_post_number + 1)
# No prepared responses here. We don't perform a completion call.

View File

@ -3,15 +3,15 @@
RSpec.describe DiscourseAi::Summarization::FoldContent do
subject(:summarizer) { DiscourseAi::Summarization.topic_summary(topic) }
let!(:llm_model) { assign_fake_provider_to(:ai_summarization_model) }
fab!(:topic) { Fabricate(:topic, highest_post_number: 2) }
fab!(:post_1) { Fabricate(:post, topic: topic, post_number: 1, raw: "This is a text") }
before { SiteSetting.ai_summarization_enabled = true }
describe "#summarize" do
let!(:llm_model) { assign_fake_provider_to(:ai_summarization_model) }
fab!(:topic) { Fabricate(:topic, highest_post_number: 2) }
fab!(:post_1) { Fabricate(:post, topic: topic, post_number: 1, raw: "This is a text") }
before do
SiteSetting.ai_summarization_enabled = true
# Make sure each content fits in a single chunk.
# 700 is the number of tokens reserved for the prompt.
model_tokens =
@ -52,4 +52,38 @@ RSpec.describe DiscourseAi::Summarization::FoldContent do
end
end
end
# Covers how the summarizer reports the `outdated` flag of an already-persisted
# summary, read through `summarizer.existing_summary`.
describe "#existing_summary" do
context "when a summary already exists" do
# Persisted summary for the topic, recorded at the topic's current
# highest_post_number.
# NOTE(review): the sha built from "1" presumably matches the content sha of a
# topic whose only summarized post is post number 1 — confirm against
# AiSummary.build_sha and the strategy's id list.
fab!(:ai_summary) do
Fabricate(
:ai_summary,
target: topic,
highest_target_number: topic.highest_post_number,
original_content_sha: AiSummary.build_sha("1"),
)
end
# Baseline: nothing was added or edited, so the flag stays false.
it "doesn't mark it as outdated" do
expect(summarizer.existing_summary.outdated).to eq(false)
end
context "when it's outdated because there are new targets" do
# Adds a second post to the topic after the summary was fabricated.
before { Fabricate(:post, topic: topic, post_number: 2, raw: "This is a text") }
it "marks it as outdated" do
expect(summarizer.existing_summary.outdated).to eq(true)
end
end
context "when it's outdated because existing content changes" do
it "marks it as outdated" do
# Same set of posts, but the post's last_version_at (5 minutes ago) is newer
# than the summary's updated_at (20 minutes ago).
ai_summary.update!(updated_at: 20.minutes.ago)
post_1.update!(last_version_at: 5.minutes.ago)
expect(summarizer.existing_summary.outdated).to eq(true)
end
end
end
end
end