FEATURE: Extend summary backfill to also generate gists (#896)

Updates default batch size to 0 and max to 10000
This commit is contained in:
Roman Rizzi 2024-11-07 13:40:18 -03:00 committed by GitHub
parent c421f713a3
commit fbc74c7467
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 52 additions and 50 deletions

View File

@ -10,34 +10,48 @@ module ::Jobs
return if !SiteSetting.ai_summarization_enabled return if !SiteSetting.ai_summarization_enabled
return if SiteSetting.ai_summary_backfill_maximum_topics_per_hour.zero? return if SiteSetting.ai_summary_backfill_maximum_topics_per_hour.zero?
# Split budget in 12 intervals, but make sure is at least one. system_user = Discourse.system_user
limit_per_job = [SiteSetting.ai_summary_backfill_maximum_topics_per_hour, 12].max / 12
budget = [current_budget, limit_per_job].min
backfill_candidates complete_t = AiSummary.summary_types[:complete]
.limit(budget) backfill_candidates(complete_t)
.limit(current_budget(complete_t))
.each do |topic| .each do |topic|
DiscourseAi::Summarization.topic_summary(topic).force_summarize(Discourse.system_user) DiscourseAi::Summarization.topic_summary(topic).force_summarize(system_user)
end end
gist_t = AiSummary.summary_types[:gist]
backfill_candidates(gist_t)
.limit(current_budget(gist_t))
.each { |topic| DiscourseAi::Summarization.topic_gist(topic).force_summarize(system_user) }
end end
def backfill_candidates def backfill_candidates(summary_type)
Topic Topic
.where("topics.word_count >= ?", SiteSetting.ai_summary_backfill_minimum_word_count) .where("topics.word_count >= ?", SiteSetting.ai_summary_backfill_minimum_word_count)
.joins( .joins(<<~SQL)
"LEFT OUTER JOIN ai_summaries ais ON topics.id = ais.target_id AND ais.target_type = 'Topic'", LEFT OUTER JOIN ai_summaries ais ON
) topics.id = ais.target_id AND
ais.target_type = 'Topic' AND
ais.summary_type = '#{summary_type}'
SQL
.where( .where(
"ais.id IS NULL OR UPPER(ais.content_range) < topics.highest_post_number + 1", "ais.id IS NULL OR UPPER(ais.content_range) < topics.highest_post_number + 1",
) # (1..1) gets stored as (1..2). ) # (1..1) gets stored as (1..2).
.order("ais.created_at DESC NULLS FIRST, topics.last_posted_at DESC") .order("ais.created_at DESC NULLS FIRST, topics.last_posted_at DESC")
end end
def current_budget def current_budget(type)
# Split the budget into 12 intervals, but make sure each one is at least one.
base_budget = SiteSetting.ai_summary_backfill_maximum_topics_per_hour base_budget = SiteSetting.ai_summary_backfill_maximum_topics_per_hour
used_budget = AiSummary.complete.system.where("created_at > ?", 1.hour.ago).count limit_per_job = [base_budget, 12].max / 12
base_budget - used_budget used_budget =
AiSummary.system.where("created_at > ?", 1.hour.ago).where(summary_type: type).count
current_budget = [(base_budget - used_budget), limit_per_job].min
return 0 if current_budget < 0
current_budget
end end
end end
end end

View File

@ -376,9 +376,9 @@ discourse_ai:
type: list type: list
list_type: compact list_type: compact
ai_summary_backfill_maximum_topics_per_hour: ai_summary_backfill_maximum_topics_per_hour:
default: 10 default: 0
min: 0 min: 0
max: 1000 max: 10000
ai_summary_backfill_minimum_word_count: ai_summary_backfill_minimum_word_count:
default: 200 default: 200
hidden: true hidden: true

View File

@ -3,6 +3,7 @@
RSpec.describe Jobs::SummariesBackfill do RSpec.describe Jobs::SummariesBackfill do
fab!(:topic) { Fabricate(:topic, word_count: 200, highest_post_number: 2) } fab!(:topic) { Fabricate(:topic, word_count: 200, highest_post_number: 2) }
let(:limit) { 24 } # guarantee two summaries per batch let(:limit) { 24 } # guarantee two summaries per batch
let(:intervals) { 12 } # budget is split into intervals. Job runs every five minutes.
before do before do
assign_fake_provider_to(:ai_summarization_model) assign_fake_provider_to(:ai_summarization_model)
@ -11,65 +12,47 @@ RSpec.describe Jobs::SummariesBackfill do
end end
describe "#current_budget" do describe "#current_budget" do
let(:type) { AiSummary.summary_types[:complete] }
context "when no summary has been backfilled yet" do context "when no summary has been backfilled yet" do
it "returns the full budget" do it "returns the full budget" do
expect(subject.current_budget).to eq(limit) expect(subject.current_budget(type)).to eq(limit / intervals)
end end
it "ignores summaries generated by users" do it "ignores summaries generated by users" do
Fabricate(:ai_summary, target: topic, origin: AiSummary.origins[:human]) Fabricate(:ai_summary, target: topic, origin: AiSummary.origins[:human])
expect(subject.current_budget).to eq(limit) expect(subject.current_budget(type)).to eq(limit / intervals)
end end
it "only accounts for complete type summaries" do it "only accounts for summaries of the given type" do
Fabricate(:topic_ai_gist, target: topic, origin: AiSummary.origins[:human]) Fabricate(:topic_ai_gist, target: topic, origin: AiSummary.origins[:human])
expect(subject.current_budget).to eq(limit) expect(subject.current_budget(type)).to eq(limit / intervals)
end
end
context "when we already backfilled stuff" do
fab!(:backfilled_summary) do
Fabricate(:ai_summary, target: topic, origin: AiSummary.origins[:system])
end
context "if it was within the budget window" do
it "reduces our budget" do
expect(subject.current_budget).to eq(limit - 1)
end
end
context "if it wasn't within the budget window" do
before { freeze_time(2.hours.from_now) }
it "returns the full budget" do
freeze_time(2.hours.from_now)
expect(subject.current_budget).to eq(limit)
end
end end
end end
end end
describe "#backfill_candidates" do describe "#backfill_candidates" do
let(:type) { AiSummary.summary_types[:complete] }
it "only selects posts with enough words" do it "only selects posts with enough words" do
topic.update!(word_count: 100) topic.update!(word_count: 100)
expect(subject.backfill_candidates).to be_empty expect(subject.backfill_candidates(type)).to be_empty
end end
it "ignores up to date summaries" do it "ignores up to date summaries" do
Fabricate(:ai_summary, target: topic, content_range: (1..2)) Fabricate(:ai_summary, target: topic, content_range: (1..2))
expect(subject.backfill_candidates).to be_empty expect(subject.backfill_candidates(type)).to be_empty
end end
it "orders candidates by topic#last_posted_at" do it "orders candidates by topic#last_posted_at" do
topic.update!(last_posted_at: 1.minute.ago) topic.update!(last_posted_at: 1.minute.ago)
topic_2 = Fabricate(:topic, word_count: 200, last_posted_at: 2.minutes.ago) topic_2 = Fabricate(:topic, word_count: 200, last_posted_at: 2.minutes.ago)
expect(subject.backfill_candidates.map(&:id)).to contain_exactly(topic.id, topic_2.id) expect(subject.backfill_candidates(type).map(&:id)).to contain_exactly(topic.id, topic_2.id)
end end
it "prioritizes topics without summaries" do it "prioritizes topics without summaries" do
@ -78,7 +61,7 @@ RSpec.describe Jobs::SummariesBackfill do
topic.update!(last_posted_at: 1.minute.ago) topic.update!(last_posted_at: 1.minute.ago)
Fabricate(:ai_summary, target: topic, content_range: (1..1)) Fabricate(:ai_summary, target: topic, content_range: (1..1))
expect(subject.backfill_candidates.map(&:id)).to contain_exactly(topic_2.id, topic.id) expect(subject.backfill_candidates(type).map(&:id)).to contain_exactly(topic_2.id, topic.id)
end end
end end
@ -88,16 +71,21 @@ RSpec.describe Jobs::SummariesBackfill do
Fabricate(:topic, word_count: 200, last_posted_at: 2.minutes.ago, highest_post_number: 1) Fabricate(:topic, word_count: 200, last_posted_at: 2.minutes.ago, highest_post_number: 1)
topic.update!(last_posted_at: 1.minute.ago) topic.update!(last_posted_at: 1.minute.ago)
Fabricate(:ai_summary, target: topic, created_at: 3.hours.ago, content_range: (1..1)) Fabricate(:ai_summary, target: topic, created_at: 3.hours.ago, content_range: (1..1))
Fabricate(:topic_ai_gist, target: topic, created_at: 3.hours.ago, content_range: (1..1))
summary_1 = "Summary of topic_2" summary_1 = "Summary of topic_2"
gist_1 = "Gist of topic_2"
summary_2 = "Summary of topic" summary_2 = "Summary of topic"
gist_2 = "Gist of topic"
DiscourseAi::Completions::Llm.with_prepared_responses([summary_1, summary_2]) do DiscourseAi::Completions::Llm.with_prepared_responses(
subject.execute({}) [summary_1, summary_2, gist_1, gist_2],
end ) { subject.execute({}) }
expect(AiSummary.find_by(target: topic_2).summarized_text).to eq(summary_1) expect(AiSummary.complete.find_by(target: topic_2).summarized_text).to eq(summary_1)
expect(AiSummary.find_by(target: topic).summarized_text).to eq(summary_2) expect(AiSummary.gist.find_by(target: topic_2).summarized_text).to eq(gist_1)
expect(AiSummary.complete.find_by(target: topic).summarized_text).to eq(summary_2)
expect(AiSummary.gist.find_by(target: topic).summarized_text).to eq(gist_2)
end end
end end
end end