From 46fcdb6ba5c4c6b15b8030330c652783af268813 Mon Sep 17 00:00:00 2001 From: Roman Rizzi Date: Thu, 16 Jan 2025 09:42:53 -0300 Subject: [PATCH] FIX: Make summaries backfill job more resilient. (#1071) To quickly select backfill candidates without comparing SHAs, we compare the last summarized post to the topic's highest_post_number. However, hiding or deleting a post and adding a small action will update this column, causing the job to stall and re-generate the same summary repeatedly until someone posts a regular reply. On top of this, this is not always true for topics with `best_replies`, as this last reply isn't necessarily included. Since this is not evident at first glance and each summarization strategy picks its targets differently, I'm opting to simplify the backfill logic and how we track potential candidates. The first step is dropping `content_range`, which serves no purpose and it's there because summary caching was supposed to work differently at the beginning. So instead, I'm replacing it with a column called `highest_target_number`, which tracks `highest_post_number` for topics and could track other things like channel's `message_count` in the future. Now that we have this column when selecting every potential backfill candidate, we'll check if the summary is truly outdated by comparing the SHAs, and if it's not, we just update the column and move on --- app/jobs/regular/fast_track_topic_gist.rb | 2 +- app/jobs/scheduled/summaries_backfill.rb | 26 ++++++-- app/models/ai_summary.rb | 35 +++++++---- .../ai_topic_summary_serializer.rb | 11 +--- ...add_highest_target_number_to_ai_summary.rb | 14 +++++ ...5181147_drop_ai_summaries_content_range.rb | 12 ++++ lib/summarization/fold_content.rb | 7 +-- lib/summarization/strategies/chat_messages.rb | 4 ++ .../strategies/hot_topic_gists.rb | 5 ++ lib/summarization/strategies/topic_summary.rb | 4 ++ spec/fabricators/ai_summary_fabricator.rb | 1 + .../jobs/scheduled/summaries_backfill_spec.rb | 42 ++++++++++--- spec/lib/guardian_extensions_spec.rb | 16 +---- .../summarization/summary_controller_spec.rb | 63 +++++++------------ .../discourse_ai/topic_summarization_spec.rb | 2 +- .../summarization/topic_summarization_spec.rb | 21 +------ 16 files changed, 149 insertions(+), 116 deletions(-) create mode 100644 db/migrate/20250115173456_add_highest_target_number_to_ai_summary.rb create mode 100644 db/post_migrate/20250115181147_drop_ai_summaries_content_range.rb diff --git a/app/jobs/regular/fast_track_topic_gist.rb b/app/jobs/regular/fast_track_topic_gist.rb index 8885bee3..a20d1817 100644 --- a/app/jobs/regular/fast_track_topic_gist.rb +++ b/app/jobs/regular/fast_track_topic_gist.rb @@ -16,7 +16,7 @@ module ::Jobs gist = summarizer.existing_summary return if gist.present? && (!gist.outdated || gist.created_at >= 5.minutes.ago) - summarizer.force_summarize(Discourse.system_user) + summarizer.summarize(Discourse.system_user) end end end diff --git a/app/jobs/scheduled/summaries_backfill.rb b/app/jobs/scheduled/summaries_backfill.rb index 34ea2d53..10dcc719 100644 --- a/app/jobs/scheduled/summaries_backfill.rb +++ b/app/jobs/scheduled/summaries_backfill.rb @@ -18,7 +18,8 @@ module ::Jobs backfill_candidates(gist_t) .limit(current_budget(gist_t)) .each do |topic| - DiscourseAi::Summarization.topic_gist(topic).force_summarize(system_user) + strategy = DiscourseAi::Summarization.topic_gist(topic) + try_summarize(strategy, system_user, topic) end end @@ -26,10 +27,25 @@ module ::Jobs backfill_candidates(complete_t) .limit(current_budget(complete_t)) .each do |topic| - DiscourseAi::Summarization.topic_summary(topic).force_summarize(system_user) + strategy = DiscourseAi::Summarization.topic_summary(topic) + try_summarize(strategy, system_user, topic) end end + def try_summarize(strategy, user, topic) + existing_summary = strategy.existing_summary + + if existing_summary.blank? || existing_summary.outdated + strategy.summarize(user) + else + # Hiding or deleting a post, and creating a small action alters the Topic#highest_post_number. + # We use this as a quick way to select potential backfill candidates without relying on original_content_sha. + # At this point, we are confident the summary doesn't need to be regenerated so something other than a regular reply + # caused the number to change in the topic. + existing_summary.update!(highest_target_number: topic.highest_post_number) + end + end + def backfill_candidates(summary_type) max_age_days = SiteSetting.ai_summary_backfill_topic_max_age_days @@ -45,12 +61,12 @@ module ::Jobs .where( <<~SQL, # (1..1) gets stored ad (1..2). ais.id IS NULL OR ( - UPPER(ais.content_range) < topics.highest_post_number + 1 - AND ais.created_at < (current_timestamp - INTERVAL '5 minutes') + ais.highest_target_number < topics.highest_post_number + AND ais.updated_at < (current_timestamp - INTERVAL '5 minutes') ) SQL ) - .order("ais.created_at DESC NULLS FIRST, topics.last_posted_at DESC") + .order("ais.updated_at DESC NULLS FIRST, topics.last_posted_at DESC") end def current_budget(type) diff --git a/app/models/ai_summary.rb b/app/models/ai_summary.rb index 30b23351..458c5f18 100644 --- a/app/models/ai_summary.rb +++ b/app/models/ai_summary.rb @@ -1,6 +1,9 @@ # frozen_string_literal: true class AiSummary < ActiveRecord::Base + # TODO remove this line 01-3-2025 + self.ignored_columns = %i[content_range] + belongs_to :target, polymorphic: true enum :summary_type, { complete: 0, gist: 1 } @@ -15,14 +18,20 @@ class AiSummary < ActiveRecord::Base target_id: strategy.target.id, target_type: strategy.target.class.name, algorithm: llm_model.name, - content_range: (content_ids.first..content_ids.last), + highest_target_number: strategy.highest_target_number, summarized_text: summary, original_content_sha: build_sha(content_ids.join), summary_type: strategy.type, origin: !!human ? origins[:human] : origins[:system], }, unique_by: %i[target_id target_type summary_type], - update_only: %i[summarized_text original_content_sha algorithm origin content_range], + update_only: %i[ + summarized_text + original_content_sha + algorithm + origin + highest_target_number + ], ) .first .then { AiSummary.find_by(id: _1["id"]) } @@ -45,17 +54,17 @@ end # # Table name: ai_summaries # -# id :bigint not null, primary key -# target_id :integer not null -# target_type :string not null -# content_range :int4range -# summarized_text :string not null -# original_content_sha :string not null -# algorithm :string not null -# created_at :datetime not null -# updated_at :datetime not null -# summary_type :integer default("complete"), not null -# origin :integer +# id :bigint not null, primary key +# target_id :integer not null +# target_type :string not null +# summarized_text :string not null +# original_content_sha :string not null +# algorithm :string not null +# created_at :datetime not null +# updated_at :datetime not null +# summary_type :integer default("complete"), not null +# origin :integer +# highest_target_number :integer not null # # Indexes # diff --git a/app/serializers/ai_topic_summary_serializer.rb b/app/serializers/ai_topic_summary_serializer.rb index 738f843e..7ffff9e2 100644 --- a/app/serializers/ai_topic_summary_serializer.rb +++ b/app/serializers/ai_topic_summary_serializer.rb @@ -13,15 +13,6 @@ class AiTopicSummarySerializer < ApplicationSerializer end def new_posts_since_summary - # Postgres uses discrete range types for int4range, which means - # (1..2) is stored as (1...3). - # - # We use Range#max to work around this, which in the case above always returns 2. - # Be careful with using Range#end here, it could lead to unexpected results as: - # - # (1..2).end => 2 - # (1...3).end => 3 - - object.target.highest_post_number.to_i - object.content_range&.max.to_i + object.target.highest_post_number.to_i - object.highest_target_number.to_i end end diff --git a/db/migrate/20250115173456_add_highest_target_number_to_ai_summary.rb b/db/migrate/20250115173456_add_highest_target_number_to_ai_summary.rb new file mode 100644 index 00000000..9a208a3e --- /dev/null +++ b/db/migrate/20250115173456_add_highest_target_number_to_ai_summary.rb @@ -0,0 +1,14 @@ +# frozen_string_literal: true +class AddHighestTargetNumberToAiSummary < ActiveRecord::Migration[7.2] + def up + add_column :ai_summaries, :highest_target_number, :integer, null: false + + execute <<~SQL + UPDATE ai_summaries SET highest_target_number = GREATEST(UPPER(content_range) - 1, 1) + SQL + end + + def down + drop_column :ai_summaries, :highest_target_number + end +end diff --git a/db/post_migrate/20250115181147_drop_ai_summaries_content_range.rb b/db/post_migrate/20250115181147_drop_ai_summaries_content_range.rb new file mode 100644 index 00000000..726b0858 --- /dev/null +++ b/db/post_migrate/20250115181147_drop_ai_summaries_content_range.rb @@ -0,0 +1,12 @@ +# frozen_string_literal: true +class DropAiSummariesContentRange < ActiveRecord::Migration[7.2] + DROPPED_COLUMNS ||= { ai_summaries: %i[content_range] } + + def up + DROPPED_COLUMNS.each { |table, columns| Migration::ColumnDropper.execute_drop(table, columns) } + end + + def down + raise ActiveRecord::IrreversibleMigration + end +end diff --git a/lib/summarization/fold_content.rb b/lib/summarization/fold_content.rb index 32bf9779..831d311e 100644 --- a/lib/summarization/fold_content.rb +++ b/lib/summarization/fold_content.rb @@ -21,6 +21,8 @@ module DiscourseAi # @param &on_partial_blk { Block - Optional } - The passed block will get called with the LLM partial response alongside a cancel function. # Note: The block is only called with results of the final summary, not intermediate summaries. # + # This method doesn't care if we already have an up to date summary. It always regenerate. + # # @returns { AiSummary } - Resulting summary. def summarize(user, &on_partial_blk) base_summary = "" @@ -68,11 +70,6 @@ module DiscourseAi AiSummary.where(target: strategy.target, summary_type: strategy.type).destroy_all end - def force_summarize(user, &on_partial_blk) - @persist_summaries = true - summarize(user, &on_partial_blk) - end - private attr_reader :persist_summaries diff --git a/lib/summarization/strategies/chat_messages.rb b/lib/summarization/strategies/chat_messages.rb index 1f3aad6d..2f073d71 100644 --- a/lib/summarization/strategies/chat_messages.rb +++ b/lib/summarization/strategies/chat_messages.rb @@ -8,6 +8,10 @@ module DiscourseAi AiSummary.summary_types[:complete] end + def highest_target_number + nil # We don't persist so we can return nil. + end + def initialize(target, since) super(target) @since = since diff --git a/lib/summarization/strategies/hot_topic_gists.rb b/lib/summarization/strategies/hot_topic_gists.rb index 1708f2b8..b3e88876 100644 --- a/lib/summarization/strategies/hot_topic_gists.rb +++ b/lib/summarization/strategies/hot_topic_gists.rb @@ -12,6 +12,10 @@ module DiscourseAi "gists" end + def highest_target_number + target.highest_post_number + end + def targets_data op_post_number = 1 @@ -38,6 +42,7 @@ module DiscourseAi .limit(20) .pluck(:post_number) end + posts_data = Post .where(topic_id: target.id) diff --git a/lib/summarization/strategies/topic_summary.rb b/lib/summarization/strategies/topic_summary.rb index c252fe33..e97c391a 100644 --- a/lib/summarization/strategies/topic_summary.rb +++ b/lib/summarization/strategies/topic_summary.rb @@ -8,6 +8,10 @@ module DiscourseAi AiSummary.summary_types[:complete] end + def highest_target_number + target.highest_post_number + end + def targets_data posts_data = (target.has_summary? ? best_replies : pick_selection).pluck( diff --git a/spec/fabricators/ai_summary_fabricator.rb b/spec/fabricators/ai_summary_fabricator.rb index 250ae943..d596dbab 100644 --- a/spec/fabricators/ai_summary_fabricator.rb +++ b/spec/fabricators/ai_summary_fabricator.rb @@ -7,6 +7,7 @@ Fabricator(:ai_summary) do target { Fabricate(:topic) } summary_type AiSummary.summary_types[:complete] origin AiSummary.origins[:human] + highest_target_number 1 end Fabricator(:topic_ai_gist, from: :ai_summary) do diff --git a/spec/jobs/scheduled/summaries_backfill_spec.rb b/spec/jobs/scheduled/summaries_backfill_spec.rb index 2764055d..401f4d20 100644 --- a/spec/jobs/scheduled/summaries_backfill_spec.rb +++ b/spec/jobs/scheduled/summaries_backfill_spec.rb @@ -44,13 +44,13 @@ RSpec.describe Jobs::SummariesBackfill do end it "ignores up to date summaries" do - Fabricate(:ai_summary, target: topic, content_range: (1..2)) + Fabricate(:ai_summary, target: topic, highest_target_number: 2, updated_at: 10.minutes.ago) expect(subject.backfill_candidates(type)).to be_empty end - it "ignores outdated summaries created less than five minutes ago" do - Fabricate(:ai_summary, target: topic, content_range: (1..1), created_at: 4.minutes.ago) + it "ignores outdated summaries updated less than five minutes ago" do + Fabricate(:ai_summary, target: topic, highest_target_number: 1, updated_at: 4.minutes.ago) expect(subject.backfill_candidates(type)).to be_empty end @@ -66,7 +66,7 @@ RSpec.describe Jobs::SummariesBackfill do topic_2 = Fabricate(:topic, word_count: 200, last_posted_at: 2.minutes.ago, highest_post_number: 1) topic.update!(last_posted_at: 1.minute.ago) - Fabricate(:ai_summary, target: topic, created_at: 1.hour.ago, content_range: (1..1)) + Fabricate(:ai_summary, target: topic, updated_at: 1.hour.ago, highest_target_number: 1) expect(subject.backfill_candidates(type).map(&:id)).to contain_exactly(topic_2.id, topic.id) end @@ -84,13 +84,13 @@ RSpec.describe Jobs::SummariesBackfill do topic_2 = Fabricate(:topic, word_count: 200, last_posted_at: 2.minutes.ago, highest_post_number: 1) topic.update!(last_posted_at: 1.minute.ago) - Fabricate(:ai_summary, target: topic, created_at: 3.hours.ago, content_range: (1..1)) - Fabricate(:topic_ai_gist, target: topic, created_at: 3.hours.ago, content_range: (1..1)) + Fabricate(:ai_summary, target: topic, updated_at: 3.hours.ago, highest_target_number: 1) + Fabricate(:topic_ai_gist, target: topic, updated_at: 3.hours.ago, highest_target_number: 1) summary_1 = "Summary of topic_2" gist_1 = "Gist of topic_2" - summary_2 = "Summary of topic" - gist_2 = "Gist of topic" + summary_2 = "Updated summary of topic" + gist_2 = "Updated gist of topic" DiscourseAi::Completions::Llm.with_prepared_responses( [gist_1, gist_2, summary_1, summary_2], @@ -100,6 +100,32 @@ RSpec.describe Jobs::SummariesBackfill do expect(AiSummary.gist.find_by(target: topic_2).summarized_text).to eq(gist_1) expect(AiSummary.complete.find_by(target: topic).summarized_text).to eq(summary_2) expect(AiSummary.gist.find_by(target: topic).summarized_text).to eq(gist_2) + + # Queue has to be empty if we just generated all summaries + expect(subject.backfill_candidates(AiSummary.summary_types[:complete])).to be_empty + expect(subject.backfill_candidates(AiSummary.summary_types[:gist])).to be_empty + + # Queue still empty when they are up to date and time passes. + AiSummary.update_all(updated_at: 20.minutes.ago) + expect(subject.backfill_candidates(AiSummary.summary_types[:complete])).to be_empty + expect(subject.backfill_candidates(AiSummary.summary_types[:gist])).to be_empty + end + + it "updates the highest_target_number if the summary turned to be up to date" do + existing_summary = + Fabricate( + :ai_summary, + target: topic, + updated_at: 3.hours.ago, + highest_target_number: topic.highest_post_number, + ) + og_highest_post_number = topic.highest_post_number + topic.update!(highest_post_number: og_highest_post_number + 1) + + # No prepared responses here. We don't perform a completion call. + subject.execute({}) + + expect(existing_summary.reload.highest_target_number).to eq(og_highest_post_number + 1) end end end diff --git a/spec/lib/guardian_extensions_spec.rb b/spec/lib/guardian_extensions_spec.rb index 589b362a..7e3bb6f0 100644 --- a/spec/lib/guardian_extensions_spec.rb +++ b/spec/lib/guardian_extensions_spec.rb @@ -26,13 +26,7 @@ describe DiscourseAi::GuardianExtensions do end it "returns true if there is a cached summary" do - AiSummary.create!( - target: topic, - summarized_text: "test", - original_content_sha: "123", - algorithm: "test", - summary_type: AiSummary.summary_types[:complete], - ) + Fabricate(:ai_summary, target: topic) expect(guardian.can_see_summary?(topic)).to eq(true) end @@ -66,13 +60,7 @@ describe DiscourseAi::GuardianExtensions do end it "returns true for anons when there is a cached summary" do - AiSummary.create!( - target: topic, - summarized_text: "test", - original_content_sha: "123", - algorithm: "test", - summary_type: AiSummary.summary_types[:complete], - ) + Fabricate(:ai_summary, target: topic) expect(guardian.can_see_summary?(topic)).to eq(true) end diff --git a/spec/requests/summarization/summary_controller_spec.rb b/spec/requests/summarization/summary_controller_spec.rb index 493d7c85..01ab28bd 100644 --- a/spec/requests/summarization/summary_controller_spec.rb +++ b/spec/requests/summarization/summary_controller_spec.rb @@ -13,15 +13,7 @@ RSpec.describe DiscourseAi::Summarization::SummaryController do context "when streaming" do it "return a cached summary with json payload and does not trigger job if it exists" do - section = - AiSummary.create!( - target: topic, - summarized_text: "test", - algorithm: "test", - original_content_sha: "test", - summary_type: AiSummary.summary_types[:complete], - ) - + summary = Fabricate(:ai_summary, target: topic) sign_in(Fabricate(:admin)) get "/discourse-ai/summarization/t/#{topic.id}.json?stream=true" @@ -29,8 +21,10 @@ RSpec.describe DiscourseAi::Summarization::SummaryController do expect(response.status).to eq(200) expect(Jobs::StreamTopicAiSummary.jobs.size).to eq(0) - summary = response.parsed_body - expect(summary.dig("ai_topic_summary", "summarized_text")).to eq(section.summarized_text) + response_summary = response.parsed_body + expect(response_summary.dig("ai_topic_summary", "summarized_text")).to eq( + summary.summarized_text, + ) end end @@ -42,21 +36,15 @@ RSpec.describe DiscourseAi::Summarization::SummaryController do end it "returns a cached summary" do - section = - AiSummary.create!( - target: topic, - summarized_text: "test", - algorithm: "test", - original_content_sha: "test", - summary_type: AiSummary.summary_types[:complete], - ) - + summary = Fabricate(:ai_summary, target: topic) get "/discourse-ai/summarization/t/#{topic.id}.json" expect(response.status).to eq(200) - summary = response.parsed_body - expect(summary.dig("ai_topic_summary", "summarized_text")).to eq(section.summarized_text) + response_summary = response.parsed_body + expect(response_summary.dig("ai_topic_summary", "summarized_text")).to eq( + summary.summarized_text, + ) end end @@ -90,15 +78,15 @@ RSpec.describe DiscourseAi::Summarization::SummaryController do get "/discourse-ai/summarization/t/#{topic.id}.json" expect(response.status).to eq(200) - summary = response.parsed_body["ai_topic_summary"] - section = AiSummary.last + response_summary = response.parsed_body["ai_topic_summary"] + summary = AiSummary.last - expect(section.summarized_text).to eq(summary_text) - expect(summary["summarized_text"]).to eq(section.summarized_text) - expect(summary["algorithm"]).to eq("fake") - expect(summary["outdated"]).to eq(false) - expect(summary["can_regenerate"]).to eq(true) - expect(summary["new_posts_since_summary"]).to be_zero + expect(summary.summarized_text).to eq(summary_text) + expect(response_summary["summarized_text"]).to eq(summary.summarized_text) + expect(response_summary["algorithm"]).to eq("fake") + expect(response_summary["outdated"]).to eq(false) + expect(response_summary["can_regenerate"]).to eq(true) + expect(response_summary["new_posts_since_summary"]).to be_zero end end @@ -129,21 +117,16 @@ RSpec.describe DiscourseAi::Summarization::SummaryController do end it "returns a cached summary" do - section = - AiSummary.create!( - target: topic, - summarized_text: "test", - algorithm: "test", - original_content_sha: "test", - summary_type: AiSummary.summary_types[:complete], - ) + summary = Fabricate(:ai_summary, target: topic) get "/discourse-ai/summarization/t/#{topic.id}.json" expect(response.status).to eq(200) - summary = response.parsed_body - expect(summary.dig("ai_topic_summary", "summarized_text")).to eq(section.summarized_text) + response_summary = response.parsed_body + expect(response_summary.dig("ai_topic_summary", "summarized_text")).to eq( + summary.summarized_text, + ) end end end diff --git a/spec/services/discourse_ai/topic_summarization_spec.rb b/spec/services/discourse_ai/topic_summarization_spec.rb index eca5a13e..a75dd527 100644 --- a/spec/services/discourse_ai/topic_summarization_spec.rb +++ b/spec/services/discourse_ai/topic_summarization_spec.rb @@ -20,7 +20,7 @@ describe DiscourseAi::TopicSummarization do cached_summary = AiSummary.find_by(target: topic, summary_type: AiSummary.summary_types[:complete]) - expect(cached_summary.content_range).to cover(*topic.posts.map(&:post_number)) + expect(cached_summary.highest_target_number).to eq(topic.highest_post_number) expect(cached_summary.summarized_text).to eq(summary) expect(cached_summary.original_content_sha).to be_present expect(cached_summary.algorithm).to eq("fake") diff --git a/spec/system/summarization/topic_summarization_spec.rb b/spec/system/summarization/topic_summarization_spec.rb index 36caea2e..45ffd7a6 100644 --- a/spec/system/summarization/topic_summarization_spec.rb +++ b/spec/system/summarization/topic_summarization_spec.rb @@ -16,6 +16,8 @@ RSpec.describe "Summarize a topic ", type: :system do let(:topic_page) { PageObjects::Pages::Topic.new } let(:summary_box) { PageObjects::Components::AiSummaryTrigger.new } + fab!(:ai_summary) { Fabricate(:ai_summary, target: topic, summarized_text: "This is a summary") } + before do group.add(current_user) @@ -27,16 +29,6 @@ RSpec.describe "Summarize a topic ", type: :system do end context "when a summary is cached" do - before do - AiSummary.create!( - target: topic, - summarized_text: summarization_result, - algorithm: "test", - original_content_sha: "test", - summary_type: AiSummary.summary_types[:complete], - ) - end - it "displays it" do topic_page.visit_topic(topic) summary_box.click_summarize @@ -45,15 +37,6 @@ RSpec.describe "Summarize a topic ", type: :system do end context "when a summary is outdated" do - before do - AiSummary.create!( - target: topic, - summarized_text: summarization_result, - algorithm: "test", - original_content_sha: "test", - summary_type: AiSummary.summary_types[:complete], - ) - end fab!(:new_post) do Fabricate( :post,