FEATURE: Extend summary backfill to also generate gists (#896)

Updates default batch size to 0 and max to 10000
2024-11-07 13:40:18 -03:00 · 2024-11-07 13:40:18 -03:00 · fbc74c7467
parent c421f713a3
commit fbc74c7467
3 changed files with 52 additions and 50 deletions
--- a/app/jobs/scheduled/summaries_backfill.rb
+++ b/app/jobs/scheduled/summaries_backfill.rb
@ -10,34 +10,48 @@ module ::Jobs
      return if !SiteSetting.ai_summarization_enabled
      return if SiteSetting.ai_summary_backfill_maximum_topics_per_hour.zero?

-      # Split budget in 12 intervals, but make sure is at least one.
-      limit_per_job = [SiteSetting.ai_summary_backfill_maximum_topics_per_hour, 12].max / 12
-      budget = [current_budget, limit_per_job].min
+      system_user = Discourse.system_user

-      backfill_candidates
-        .limit(budget)
+      complete_t = AiSummary.summary_types[:complete]
+      backfill_candidates(complete_t)
+        .limit(current_budget(complete_t))
        .each do |topic|
-          DiscourseAi::Summarization.topic_summary(topic).force_summarize(Discourse.system_user)
-        end
+          DiscourseAi::Summarization.topic_summary(topic).force_summarize(system_user)
        end

-    def backfill_candidates
+      gist_t = AiSummary.summary_types[:gist]
+      backfill_candidates(gist_t)
+        .limit(current_budget(gist_t))
+        .each { |topic| DiscourseAi::Summarization.topic_gist(topic).force_summarize(system_user) }
+    end
+
+    def backfill_candidates(summary_type)
      Topic
        .where("topics.word_count >= ?", SiteSetting.ai_summary_backfill_minimum_word_count)
-        .joins(
-          "LEFT OUTER JOIN ai_summaries ais ON topics.id = ais.target_id AND ais.target_type = 'Topic'",
-        )
+        .joins(<<~SQL)
+          LEFT OUTER JOIN ai_summaries ais ON 
+                          topics.id = ais.target_id AND 
+                          ais.target_type = 'Topic' AND 
+                          ais.summary_type = '#{summary_type}'
+        SQL
        .where(
          "ais.id IS NULL OR UPPER(ais.content_range) < topics.highest_post_number + 1",
        ) # (1..1) gets stored as (1..2).
        .order("ais.created_at DESC NULLS FIRST, topics.last_posted_at DESC")
    end

-    def current_budget
+    def current_budget(type)
+      # Split the budget into 12 intervals, but make sure it is at least one.
      base_budget = SiteSetting.ai_summary_backfill_maximum_topics_per_hour
-      used_budget = AiSummary.complete.system.where("created_at > ?", 1.hour.ago).count
+      limit_per_job = [base_budget, 12].max / 12

-      base_budget - used_budget
+      used_budget =
+        AiSummary.system.where("created_at > ?", 1.hour.ago).where(summary_type: type).count
+
+      current_budget = [(base_budget - used_budget), limit_per_job].min
+      return 0 if current_budget < 0
+
+      current_budget
    end
  end
 end
--- a/config/settings.yml
+++ b/config/settings.yml
@ -376,9 +376,9 @@ discourse_ai:
    type: list
    list_type: compact
  ai_summary_backfill_maximum_topics_per_hour:
-    default: 10
+    default: 0
    min: 0
-    max: 1000
+    max: 10000
  ai_summary_backfill_minimum_word_count:
    default: 200
    hidden: true
--- a/spec/jobs/scheduled/summaries_backfill_spec.rb
+++ b/spec/jobs/scheduled/summaries_backfill_spec.rb
@ -3,6 +3,7 @@
 RSpec.describe Jobs::SummariesBackfill do
  fab!(:topic) { Fabricate(:topic, word_count: 200, highest_post_number: 2) }
  let(:limit) { 24 } # guarantee two summaries per batch
+  let(:intervals) { 12 } # budget is split into intervals. Job runs every five minutes.

  before do
    assign_fake_provider_to(:ai_summarization_model)
@ -11,65 +12,47 @@ RSpec.describe Jobs::SummariesBackfill do
  end

  describe "#current_budget" do
+    let(:type) { AiSummary.summary_types[:complete] }
+
    context "when no summary has been backfilled yet" do
      it "returns the full budget" do
-        expect(subject.current_budget).to eq(limit)
+        expect(subject.current_budget(type)).to eq(limit / intervals)
      end

      it "ignores summaries generated by users" do
        Fabricate(:ai_summary, target: topic, origin: AiSummary.origins[:human])

-        expect(subject.current_budget).to eq(limit)
+        expect(subject.current_budget(type)).to eq(limit / intervals)
      end

-      it "only accounts for complete type summaries" do
+      it "only accounts for summaries of the given type" do
        Fabricate(:topic_ai_gist, target: topic, origin: AiSummary.origins[:human])

-        expect(subject.current_budget).to eq(limit)
-      end
-    end
-
-    context "when we already backfilled stuff" do
-      fab!(:backfilled_summary) do
-        Fabricate(:ai_summary, target: topic, origin: AiSummary.origins[:system])
-      end
-
-      context "if it was within the budget window" do
-        it "reduces our budget" do
-          expect(subject.current_budget).to eq(limit - 1)
-        end
-      end
-
-      context "if it wasn't within the budget window" do
-        before { freeze_time(2.hours.from_now) }
-
-        it "returns the full budget" do
-          freeze_time(2.hours.from_now)
-
-          expect(subject.current_budget).to eq(limit)
-        end
+        expect(subject.current_budget(type)).to eq(limit / intervals)
      end
    end
  end

  describe "#backfill_candidates" do
+    let(:type) { AiSummary.summary_types[:complete] }
+
    it "only selects posts with enough words" do
      topic.update!(word_count: 100)

-      expect(subject.backfill_candidates).to be_empty
+      expect(subject.backfill_candidates(type)).to be_empty
    end

    it "ignores up to date summaries" do
      Fabricate(:ai_summary, target: topic, content_range: (1..2))

-      expect(subject.backfill_candidates).to be_empty
+      expect(subject.backfill_candidates(type)).to be_empty
    end

    it "orders candidates by topic#last_posted_at" do
      topic.update!(last_posted_at: 1.minute.ago)
      topic_2 = Fabricate(:topic, word_count: 200, last_posted_at: 2.minutes.ago)

-      expect(subject.backfill_candidates.map(&:id)).to contain_exactly(topic.id, topic_2.id)
+      expect(subject.backfill_candidates(type).map(&:id)).to contain_exactly(topic.id, topic_2.id)
    end

    it "prioritizes topics without summaries" do
@ -78,7 +61,7 @@ RSpec.describe Jobs::SummariesBackfill do
      topic.update!(last_posted_at: 1.minute.ago)
      Fabricate(:ai_summary, target: topic, content_range: (1..1))

-      expect(subject.backfill_candidates.map(&:id)).to contain_exactly(topic_2.id, topic.id)
+      expect(subject.backfill_candidates(type).map(&:id)).to contain_exactly(topic_2.id, topic.id)
    end
  end

@ -88,16 +71,21 @@ RSpec.describe Jobs::SummariesBackfill do
        Fabricate(:topic, word_count: 200, last_posted_at: 2.minutes.ago, highest_post_number: 1)
      topic.update!(last_posted_at: 1.minute.ago)
      Fabricate(:ai_summary, target: topic, created_at: 3.hours.ago, content_range: (1..1))
+      Fabricate(:topic_ai_gist, target: topic, created_at: 3.hours.ago, content_range: (1..1))

      summary_1 = "Summary of topic_2"
+      gist_1 = "Gist of topic_2"
      summary_2 = "Summary of topic"
+      gist_2 = "Gist of topic"

-      DiscourseAi::Completions::Llm.with_prepared_responses([summary_1, summary_2]) do
-        subject.execute({})
-      end
+      DiscourseAi::Completions::Llm.with_prepared_responses(
+        [summary_1, summary_2, gist_1, gist_2],
+      ) { subject.execute({}) }

-      expect(AiSummary.find_by(target: topic_2).summarized_text).to eq(summary_1)
-      expect(AiSummary.find_by(target: topic).summarized_text).to eq(summary_2)
+      expect(AiSummary.complete.find_by(target: topic_2).summarized_text).to eq(summary_1)
+      expect(AiSummary.gist.find_by(target: topic_2).summarized_text).to eq(gist_1)
+      expect(AiSummary.complete.find_by(target: topic).summarized_text).to eq(summary_2)
+      expect(AiSummary.gist.find_by(target: topic).summarized_text).to eq(gist_2)
    end
  end
 end