discourse-ai/spec/services/discourse_ai/topic_summarization_spec.rb
Roman Rizzi 46fcdb6ba5
FIX: Make summaries backfill job more resilient. (#1071)
To quickly select backfill candidates without comparing SHAs, we compare the last summarized post to the topic's `highest_post_number`. However, hiding or deleting a post adds a small action, which also bumps this column, so the job stalls on the same topic and re-generates the same summary repeatedly until someone posts a regular reply. On top of that, the comparison isn't always accurate for summaries built from `best_replies`, since the last reply isn't necessarily included in them.
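
A sketch of the old check (names are illustrative, not the job's actual code):

    # Old approach (sketch): a topic stays a candidate whenever the topic's
    # counter moves past the last summarized post. A small action (e.g. the one
    # created by hiding a post) bumps highest_post_number without adding any
    # summarizable content, so this keeps returning true on every run.
    def backfill_candidate?(topic, last_summarized_post_number)
      topic.highest_post_number > last_summarized_post_number
    end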

Since this is not evident at first glance and each summarization strategy picks its targets differently, I'm opting to simplify the backfill logic and how we track potential candidates.

The first step is dropping `content_range`, which serves no purpose; it only exists because summary caching was originally supposed to work differently. I'm replacing it with a column called `highest_target_number`, which tracks `highest_post_number` for topics and could track other things, like a channel's `message_count`, in the future.
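
A minimal sketch of the schema change (migration name, Rails version, and the old column's type are assumptions; the real migration may also backfill existing rows):

    # frozen_string_literal: true
    class ReplaceContentRangeWithHighestTargetNumber < ActiveRecord::Migration[7.1]
      def change
        # Mirrors the target's own counter: highest_post_number for topics,
        # and potentially a channel's message_count later on.
        add_column :ai_summaries, :highest_target_number, :integer
        remove_column :ai_summaries, :content_range, :int4range # type assumed
      end
    end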

Now that we have this column, when selecting potential backfill candidates we check whether the summary is truly outdated by comparing SHAs; if it isn't, we just update the column and move on.
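
Roughly, the backfill pass now does this (a sketch; `candidates` and `current_content_sha` are hypothetical, while the models and `DiscourseAi::TopicSummarization` appear in the spec below):

    # candidates: topics whose highest_post_number exceeds the summary's
    # highest_target_number, i.e. everything that *might* need a new summary.
    candidates.each do |topic|
      summary = AiSummary.find_by(target: topic, summary_type: AiSummary.summary_types[:complete])
      strategy = DiscourseAi::Summarization.topic_summary(topic)

      if summary.original_content_sha == strategy.current_content_sha # hypothetical helper
        # Nothing summarizable changed (e.g. only a small action was added):
        # record the new watermark so the topic stops looking like a candidate.
        summary.update!(highest_target_number: topic.highest_post_number)
      else
        DiscourseAi::TopicSummarization.new(strategy, Discourse.system_user).summarize
      end
    end
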
2025-01-16 09:42:53 -03:00

135 lines
4.8 KiB
Ruby

# frozen_string_literal: true

describe DiscourseAi::TopicSummarization do
  fab!(:user) { Fabricate(:admin) }
  fab!(:topic) { Fabricate(:topic, highest_post_number: 2) }
  fab!(:post_1) { Fabricate(:post, topic: topic, post_number: 1) }
  fab!(:post_2) { Fabricate(:post, topic: topic, post_number: 2) }

  before do
    assign_fake_provider_to(:ai_summarization_model)
    SiteSetting.ai_summarization_enabled = true
  end

  let(:strategy) { DiscourseAi::Summarization.topic_summary(topic) }

  describe "#summarize" do
    subject(:summarization) { described_class.new(strategy, user) }

    def assert_summary_is_cached(topic, summary_response)
      cached_summary =
        AiSummary.find_by(target: topic, summary_type: AiSummary.summary_types[:complete])

      expect(cached_summary.highest_target_number).to eq(topic.highest_post_number)
      expect(cached_summary.summarized_text).to eq(summary_response)
      expect(cached_summary.original_content_sha).to be_present
      expect(cached_summary.algorithm).to eq("fake")
    end
context "when the content was summarized in a single chunk" do
let(:summary) { "This is the final summary" }
it "caches the summary" do
DiscourseAi::Completions::Llm.with_prepared_responses([summary]) do
section = summarization.summarize
expect(section.summarized_text).to eq(summary)
assert_summary_is_cached(topic, summary)
end
end
it "returns the cached version in subsequent calls" do
summarization.summarize
cached_summary_text = "This is a cached summary"
AiSummary.find_by(target: topic, summary_type: AiSummary.summary_types[:complete]).update!(
summarized_text: cached_summary_text,
updated_at: 24.hours.ago,
)
summarization = described_class.new(strategy, user)
section = summarization.summarize
expect(section.summarized_text).to eq(cached_summary_text)
end
end
describe "invalidating cached summaries" do
let(:cached_text) { "This is a cached summary" }
let(:updated_summary) { "This is the final summary" }
def cached_summary
AiSummary.find_by(target: topic, summary_type: AiSummary.summary_types[:complete])
end
before do
# a bit tricky, but fold_content now caches an instance of LLM
# once it is cached with_prepared_responses will not work as expected
# since it is glued to the old llm instance
# so we create the cached summary totally independantly
DiscourseAi::Completions::Llm.with_prepared_responses([cached_text]) do
strategy = DiscourseAi::Summarization.topic_summary(topic)
described_class.new(strategy, user).summarize
end
cached_summary.update!(summarized_text: cached_text, created_at: 24.hours.ago)
end
context "when the user can requests new summaries" do
context "when there are no new posts" do
it "returns the cached summary" do
section = summarization.summarize
expect(section.summarized_text).to eq(cached_text)
end
end
context "when there are new posts" do
before { cached_summary.update!(original_content_sha: "outdated_sha") }
it "returns a new summary" do
DiscourseAi::Completions::Llm.with_prepared_responses([updated_summary]) do
section = summarization.summarize
expect(section.summarized_text).to eq(updated_summary)
end
end
context "when the cached summary is less than one hour old" do
before { cached_summary.update!(created_at: 30.minutes.ago) }
it "returns the cached summary" do
cached_summary.update!(created_at: 30.minutes.ago)
section = summarization.summarize
expect(section.summarized_text).to eq(cached_text)
expect(section.outdated).to eq(true)
end
it "returns a new summary if the skip_age_check flag is passed" do
DiscourseAi::Completions::Llm.with_prepared_responses([updated_summary]) do
section = summarization.summarize(skip_age_check: true)
expect(section.summarized_text).to eq(updated_summary)
end
end
end
end
end
end
describe "stream partial updates" do
let(:summary) { "This is the final summary" }
it "receives a blk that is passed to the underlying strategy and called with partial summaries" do
partial_result = +""
DiscourseAi::Completions::Llm.with_prepared_responses([summary]) do
summarization.summarize { |partial_summary| partial_result << partial_summary }
end
expect(partial_result).to eq(summary)
end
end
end
end