FEATURE: Automatically backfill regular summaries. (#892)

This change introduces a scheduled job that summarizes topics and caches the results automatically. We provide settings to control how many topics we'll backfill per hour and the minimum word count a topic needs to qualify. Topics without a summary are prioritized over topics whose summary is outdated.
2024-11-04 17:48:11 -03:00 · 2024-11-04 17:48:11 -03:00 · 9505a8976c
parent 98022d7d96
commit 9505a8976c
10 changed files with 191 additions and 9 deletions
--- a/app/jobs/scheduled/summaries_backfill.rb
+++ b/app/jobs/scheduled/summaries_backfill.rb
@ -0,0 +1,43 @@
+# frozen_string_literal: true
+
+module ::Jobs
+  # Scheduled job that generates and caches complete summaries for topics that
+  # either have none yet or whose summary no longer covers the latest post.
+  #
+  # The work is throttled by `ai_summary_backfill_maximum_topics_per_hour`;
+  # only system-generated complete summaries count against that budget.
+  class SummariesBackfill < ::Jobs::Scheduled
+    every 5.minutes
+    cluster_concurrency 1
+
+    # The job runs every 5 minutes, i.e. 12 times per hour.
+    RUNS_PER_HOUR = 12
+
+    # Summarizes the next batch of candidates, if the module is enabled and
+    # there is budget left for the current hour.
+    def execute(_args)
+      return if !SiteSetting.discourse_ai_enabled
+      return if !SiteSetting.ai_summarization_enabled
+
+      hourly_limit = SiteSetting.ai_summary_backfill_maximum_topics_per_hour
+      return if hourly_limit.zero?
+
+      # Split the hourly budget across the runs in an hour, but make sure each
+      # run is allowed to process at least one topic.
+      limit_per_job = [hourly_limit, RUNS_PER_HOUR].max / RUNS_PER_HOUR
+      budget = [current_budget, limit_per_job].min
+
+      # The remaining budget is zero or negative once the hourly maximum has
+      # been reached or exceeded (e.g. the setting was lowered). Bail out
+      # instead of handing a negative LIMIT to the database.
+      return if budget <= 0
+
+      system_user = Discourse.system_user
+
+      backfill_candidates
+        .limit(budget)
+        .each do |topic|
+          DiscourseAi::Summarization.topic_summary(topic).force_summarize(system_user)
+        end
+    end
+
+    # Topics with at least `ai_summary_backfill_minimum_word_count` words that
+    # have no complete summary, or whose complete summary is outdated.
+    # Topics without a summary come first, then the most recently posted-in.
+    #
+    # @return [ActiveRecord::Relation<Topic>]
+    def backfill_candidates
+      complete_type = AiSummary.summary_types[:complete]
+
+      # Only join complete summaries: gists live in the same table and must not
+      # hide a topic that lacks a complete summary, nor yield a topic twice.
+      Topic
+        .where("topics.word_count >= ?", SiteSetting.ai_summary_backfill_minimum_word_count)
+        .joins(<<~SQL)
+          LEFT OUTER JOIN ai_summaries ais ON
+            topics.id = ais.target_id AND
+            ais.target_type = 'Topic' AND
+            ais.summary_type = #{complete_type.to_i}
+        SQL
+        .where(
+          "ais.id IS NULL OR UPPER(ais.content_range) < topics.highest_post_number + 1",
+        ) # (1..1) gets stored as (1..2), hence the + 1.
+        .order("ais.created_at DESC NULLS FIRST, topics.last_posted_at DESC")
+    end
+
+    # Hourly maximum minus the number of system-generated complete summaries
+    # created during the last hour. May be zero or negative.
+    #
+    # @return [Integer]
+    def current_budget
+      base_budget = SiteSetting.ai_summary_backfill_maximum_topics_per_hour
+      used_budget = AiSummary.complete.system.where("created_at > ?", 1.hour.ago).count
+
+      base_budget - used_budget
+    end
+  end
+end
--- a/app/models/ai_summary.rb
+++ b/app/models/ai_summary.rb
@ -4,15 +4,19 @@ class AiSummary < ActiveRecord::Base
  belongs_to :target, polymorphic: true

  enum :summary_type, { complete: 0, gist: 1 }
+  enum :origin, { human: 0, system: 1 }
+
+  # Persists a summary record for `strategy.target`.
+  #
+  # strategy   - responds to `target` and `type` (stored as summary_type).
+  # llm_model  - responds to `name` (stored as the algorithm).
+  # summary    - the summarized text.
+  # og_content - list of items responding to `[:id]`; the first and last ids
+  #              delimit the stored content_range, and all ids feed the SHA.
+  # human:     - truthy stores the `human` origin, anything else (including
+  #              nil) stores `system`.
+  #
+  # Raises if the record cannot be created (`create!`).
+  def self.store!(strategy, llm_model, summary, og_content, human:)
+    content_ids = og_content.map { |c| c[:id] }
 
-  def self.store!(target, summary_type, model, summary, content_ids)
     AiSummary.create!(
-      target: target,
-      algorithm: model,
+      target: strategy.target,
+      algorithm: llm_model.name,
       content_range: (content_ids.first..content_ids.last),
       summarized_text: summary,
       original_content_sha: build_sha(content_ids.join),
-      summary_type: summary_type,
+      summary_type: strategy.type,
+      origin: !!human ? origins[:human] : origins[:system],
     )
   end

@ -43,6 +47,7 @@ end
 #  created_at           :datetime         not null
 #  updated_at           :datetime         not null
 #  summary_type         :integer          default("complete"), not null
+#  origin               :integer
 #
 # Indexes
 #
--- a/config/locales/server.en.yml
+++ b/config/locales/server.en.yml
@ -79,12 +79,14 @@ en:
    ai_embeddings_semantic_related_include_closed_topics: "Include closed topics in semantic search results"
    ai_embeddings_semantic_search_hyde_model: "Model used to expand keywords to get better results during a semantic search"
    ai_embeddings_per_post_enabled: Generate embeddings for each post
+    
    ai_summarization_enabled: "Enable the topic summarization module."
    ai_summarization_model: "Model to use for summarization."
    ai_custom_summarization_allowed_groups: "Groups allowed to use create new summaries."
    ai_pm_summarization_allowed_groups: "Groups allowed to create and view summaries in PMs."
    ai_summarize_max_hot_topics_gists_per_batch: "After updating topics in the hot list, we'll generate brief summaries of the first N ones. (Disabled when 0)"
    ai_hot_topic_gists_allowed_groups:  "Groups allowed to see gists in the hot topics list."
+    ai_summary_backfill_maximum_topics_per_hour: "Number of topic summaries to backfill per hour."

    ai_bot_enabled: "Enable the AI Bot module."
    ai_bot_enable_chat_warning: "Display a warning when PM chat is initiated. Can be overriden by editing the translation string: discourse_ai.ai_bot.pm_warning"
--- a/config/settings.yml
+++ b/config/settings.yml
@ -375,6 +375,13 @@ discourse_ai:
    hidden: true
    type: list
    list_type: compact
+  ai_summary_backfill_maximum_topics_per_hour:
+    default: 10
+    min: 0
+    max: 1000
+  ai_summary_backfill_minimum_word_count:
+    default: 200
+    hidden: true

  ai_bot_enabled:
    default: false
--- a/db/migrate/20241031145203_track_ai_summary_origin.rb
+++ b/db/migrate/20241031145203_track_ai_summary_origin.rb
@ -0,0 +1,6 @@
+# frozen_string_literal: true
+# Adds a nullable integer `origin` column to `ai_summaries`.
+# NOTE(review): presumably backs the `AiSummary.origin` enum
+# (human: 0, system: 1) — confirm against the model.
+class TrackAiSummaryOrigin < ActiveRecord::Migration[7.1]
+  def change
+    add_column :ai_summaries, :origin, :integer
+  end
+end
--- a/db/migrate/20241031180044_set_origin_for_existing_ai_summaries.rb
+++ b/db/migrate/20241031180044_set_origin_for_existing_ai_summaries.rb
@ -0,0 +1,14 @@
+# frozen_string_literal: true
+# Backfills `origin` for summaries that do not have one yet.
+class SetOriginForExistingAiSummaries < ActiveRecord::Migration[7.1]
+  # Rows with summary_type = 0 get origin 0; every other row gets origin 1.
+  # Rows that already have an origin are left untouched.
+  # NOTE(review): per the AiSummary enums this reads as complete => human and
+  # gist => system — confirm that is the intended assumption for old rows.
+  def up
+    DB.exec <<~SQL
+      UPDATE ai_summaries
+      SET origin = CASE WHEN summary_type = 0 THEN 0 ELSE 1 END
+      WHERE origin IS NULL
+    SQL
+  end
+
+  # The previous (NULL) values cannot be told apart after the update.
+  def down
+    raise ActiveRecord::IrreversibleMigration
+  end
+end
--- a/lib/summarization/fold_content.rb
+++ b/lib/summarization/fold_content.rb
@ -35,11 +35,11 @@ module DiscourseAi

        if persist_summaries
          AiSummary.store!(
-            strategy.target,
-            strategy.type,
-            llm_model.name,
+            strategy,
+            llm_model,
            clean_summary,
-            truncated_content.map { |c| c[:id] },
+            truncated_content,
+            human: user&.human?,
          )
        else
          AiSummary.new(summarized_text: clean_summary)
--- a/spec/fabricators/ai_summary_fabricator.rb
+++ b/spec/fabricators/ai_summary_fabricator.rb
@ -6,9 +6,11 @@ Fabricator(:ai_summary) do
  algorithm "test"
  target { Fabricate(:topic) }
  summary_type AiSummary.summary_types[:complete]
+  origin AiSummary.origins[:human]
 end

 Fabricator(:topic_ai_gist, from: :ai_summary) do
  summarized_text "gist"
  summary_type AiSummary.summary_types[:gist]
+  origin AiSummary.origins[:system]
 end
--- a/spec/jobs/scheduled/summaries_backfill_spec.rb
+++ b/spec/jobs/scheduled/summaries_backfill_spec.rb
@ -0,0 +1,103 @@
+# frozen_string_literal: true
+
+RSpec.describe Jobs::SummariesBackfill do
+  fab!(:topic) { Fabricate(:topic, word_count: 200, highest_post_number: 2) }
+  let(:limit) { 24 } # guarantee two summaries per batch
+
+  before do
+    assign_fake_provider_to(:ai_summarization_model)
+    SiteSetting.ai_summarization_enabled = true
+    SiteSetting.ai_summary_backfill_maximum_topics_per_hour = limit
+  end
+
+  describe "#current_budget" do
+    context "when no summary has been backfilled yet" do
+      it "returns the full budget" do
+        expect(subject.current_budget).to eq(limit)
+      end
+
+      it "ignores summaries generated by users" do
+        Fabricate(:ai_summary, target: topic, origin: AiSummary.origins[:human])
+
+        expect(subject.current_budget).to eq(limit)
+      end
+
+      it "only accounts for complete type summaries" do
+        # Use the system origin so the summary type is the only reason this
+        # record is left out of the used budget.
+        Fabricate(:topic_ai_gist, target: topic, origin: AiSummary.origins[:system])
+
+        expect(subject.current_budget).to eq(limit)
+      end
+    end
+
+    context "when we already backfilled stuff" do
+      fab!(:backfilled_summary) do
+        Fabricate(:ai_summary, target: topic, origin: AiSummary.origins[:system])
+      end
+
+      context "if it was within the budget window" do
+        it "reduces our budget" do
+          expect(subject.current_budget).to eq(limit - 1)
+        end
+      end
+
+      context "if it wasn't within the budget window" do
+        before { freeze_time(2.hours.from_now) }
+
+        it "returns the full budget" do
+          expect(subject.current_budget).to eq(limit)
+        end
+      end
+    end
+  end
+
+  describe "#backfill_candidates" do
+    it "only selects topics with enough words" do
+      topic.update!(word_count: 100)
+
+      expect(subject.backfill_candidates).to be_empty
+    end
+
+    it "ignores up to date summaries" do
+      Fabricate(:ai_summary, target: topic, content_range: (1..2))
+
+      expect(subject.backfill_candidates).to be_empty
+    end
+
+    it "orders candidates by topic#last_posted_at" do
+      topic.update!(last_posted_at: 1.minute.ago)
+      topic_2 = Fabricate(:topic, word_count: 200, last_posted_at: 2.minutes.ago)
+
+      # `eq` (unlike `contain_exactly`) also verifies the order.
+      expect(subject.backfill_candidates.map(&:id)).to eq([topic.id, topic_2.id])
+    end
+
+    it "prioritizes topics without summaries" do
+      topic_2 =
+        Fabricate(:topic, word_count: 200, last_posted_at: 2.minutes.ago, highest_post_number: 1)
+      topic.update!(last_posted_at: 1.minute.ago)
+      Fabricate(:ai_summary, target: topic, content_range: (1..1))
+
+      # topic was posted in more recently, but topic_2 has no summary at all.
+      expect(subject.backfill_candidates.map(&:id)).to eq([topic_2.id, topic.id])
+    end
+  end
+
+  describe "#execute" do
+    it "backfills a batch" do
+      topic_2 =
+        Fabricate(:topic, word_count: 200, last_posted_at: 2.minutes.ago, highest_post_number: 1)
+      topic.update!(last_posted_at: 1.minute.ago)
+      Fabricate(:ai_summary, target: topic, created_at: 3.hours.ago, content_range: (1..1))
+
+      summary_1 = "Summary of topic_2"
+      summary_2 = "Summary of topic"
+
+      DiscourseAi::Completions::Llm.with_prepared_responses([summary_1, summary_2]) do
+        subject.execute({})
+      end
+
+      expect(AiSummary.find_by(target: topic_2).summarized_text).to eq(summary_1)
+      expect(AiSummary.find_by(target: topic).summarized_text).to eq(summary_2)
+    end
+  end
+end
--- a/spec/lib/modules/summarization/fold_content_spec.rb
+++ b/spec/lib/modules/summarization/fold_content_spec.rb
@ -26,7 +26,7 @@ RSpec.describe DiscourseAi::Summarization::FoldContent do
    let(:single_summary) { "single" }
    let(:concatenated_summary) { "this is a concatenated summary" }

-    let(:user) { User.new }
+    fab!(:user)

    context "when the content to summarize fits in a single call" do
      it "does one call to summarize content" do