FEATURE: Extend summary backfill to also generate gists (#896)

Updates default batch size to 0 and max to 10000
This commit is contained in:
Roman Rizzi 2024-11-07 13:40:18 -03:00 committed by GitHub
parent c421f713a3
commit fbc74c7467
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 52 additions and 50 deletions

View File

@ -10,34 +10,48 @@ module ::Jobs
return if !SiteSetting.ai_summarization_enabled
return if SiteSetting.ai_summary_backfill_maximum_topics_per_hour.zero?
# Split the budget into 12 intervals, but make sure it is at least one.
limit_per_job = [SiteSetting.ai_summary_backfill_maximum_topics_per_hour, 12].max / 12
budget = [current_budget, limit_per_job].min
system_user = Discourse.system_user
backfill_candidates
.limit(budget)
complete_t = AiSummary.summary_types[:complete]
backfill_candidates(complete_t)
.limit(current_budget(complete_t))
.each do |topic|
DiscourseAi::Summarization.topic_summary(topic).force_summarize(Discourse.system_user)
end
DiscourseAi::Summarization.topic_summary(topic).force_summarize(system_user)
end
def backfill_candidates
gist_t = AiSummary.summary_types[:gist]
backfill_candidates(gist_t)
.limit(current_budget(gist_t))
.each { |topic| DiscourseAi::Summarization.topic_gist(topic).force_summarize(system_user) }
end
def backfill_candidates(summary_type)
Topic
.where("topics.word_count >= ?", SiteSetting.ai_summary_backfill_minimum_word_count)
.joins(
"LEFT OUTER JOIN ai_summaries ais ON topics.id = ais.target_id AND ais.target_type = 'Topic'",
)
.joins(<<~SQL)
LEFT OUTER JOIN ai_summaries ais ON
topics.id = ais.target_id AND
ais.target_type = 'Topic' AND
ais.summary_type = '#{summary_type}'
SQL
.where(
"ais.id IS NULL OR UPPER(ais.content_range) < topics.highest_post_number + 1",
) # (1..1) gets stored as (1..2).
.order("ais.created_at DESC NULLS FIRST, topics.last_posted_at DESC")
end
def current_budget
def current_budget(type)
# Split the budget into 12 intervals, but make sure it is at least one.
base_budget = SiteSetting.ai_summary_backfill_maximum_topics_per_hour
used_budget = AiSummary.complete.system.where("created_at > ?", 1.hour.ago).count
limit_per_job = [base_budget, 12].max / 12
base_budget - used_budget
used_budget =
AiSummary.system.where("created_at > ?", 1.hour.ago).where(summary_type: type).count
current_budget = [(base_budget - used_budget), limit_per_job].min
return 0 if current_budget < 0
current_budget
end
end
end

View File

@ -376,9 +376,9 @@ discourse_ai:
type: list
list_type: compact
ai_summary_backfill_maximum_topics_per_hour:
default: 10
default: 0
min: 0
max: 1000
max: 10000
ai_summary_backfill_minimum_word_count:
default: 200
hidden: true

View File

@ -3,6 +3,7 @@
RSpec.describe Jobs::SummariesBackfill do
fab!(:topic) { Fabricate(:topic, word_count: 200, highest_post_number: 2) }
let(:limit) { 24 } # guarantee two summaries per batch
let(:intervals) { 12 } # budget is split into intervals. Job runs every five minutes.
before do
assign_fake_provider_to(:ai_summarization_model)
@ -11,65 +12,47 @@ RSpec.describe Jobs::SummariesBackfill do
end
describe "#current_budget" do
let(:type) { AiSummary.summary_types[:complete] }
context "when no summary has been backfilled yet" do
it "returns the full budget" do
expect(subject.current_budget).to eq(limit)
expect(subject.current_budget(type)).to eq(limit / intervals)
end
it "ignores summaries generated by users" do
Fabricate(:ai_summary, target: topic, origin: AiSummary.origins[:human])
expect(subject.current_budget).to eq(limit)
expect(subject.current_budget(type)).to eq(limit / intervals)
end
it "only accounts for complete type summaries" do
it "only accounts for summaries of the given type" do
Fabricate(:topic_ai_gist, target: topic, origin: AiSummary.origins[:human])
expect(subject.current_budget).to eq(limit)
end
end
context "when we already backfilled stuff" do
fab!(:backfilled_summary) do
Fabricate(:ai_summary, target: topic, origin: AiSummary.origins[:system])
end
context "if it was within the budget window" do
it "reduces our budget" do
expect(subject.current_budget).to eq(limit - 1)
end
end
context "if it wasn't within the budget window" do
before { freeze_time(2.hours.from_now) }
it "returns the full budget" do
freeze_time(2.hours.from_now)
expect(subject.current_budget).to eq(limit)
end
expect(subject.current_budget(type)).to eq(limit / intervals)
end
end
end
describe "#backfill_candidates" do
let(:type) { AiSummary.summary_types[:complete] }
it "only selects posts with enough words" do
topic.update!(word_count: 100)
expect(subject.backfill_candidates).to be_empty
expect(subject.backfill_candidates(type)).to be_empty
end
it "ignores up to date summaries" do
Fabricate(:ai_summary, target: topic, content_range: (1..2))
expect(subject.backfill_candidates).to be_empty
expect(subject.backfill_candidates(type)).to be_empty
end
it "orders candidates by topic#last_posted_at" do
topic.update!(last_posted_at: 1.minute.ago)
topic_2 = Fabricate(:topic, word_count: 200, last_posted_at: 2.minutes.ago)
expect(subject.backfill_candidates.map(&:id)).to contain_exactly(topic.id, topic_2.id)
expect(subject.backfill_candidates(type).map(&:id)).to contain_exactly(topic.id, topic_2.id)
end
it "prioritizes topics without summaries" do
@ -78,7 +61,7 @@ RSpec.describe Jobs::SummariesBackfill do
topic.update!(last_posted_at: 1.minute.ago)
Fabricate(:ai_summary, target: topic, content_range: (1..1))
expect(subject.backfill_candidates.map(&:id)).to contain_exactly(topic_2.id, topic.id)
expect(subject.backfill_candidates(type).map(&:id)).to contain_exactly(topic_2.id, topic.id)
end
end
@ -88,16 +71,21 @@ RSpec.describe Jobs::SummariesBackfill do
Fabricate(:topic, word_count: 200, last_posted_at: 2.minutes.ago, highest_post_number: 1)
topic.update!(last_posted_at: 1.minute.ago)
Fabricate(:ai_summary, target: topic, created_at: 3.hours.ago, content_range: (1..1))
Fabricate(:topic_ai_gist, target: topic, created_at: 3.hours.ago, content_range: (1..1))
summary_1 = "Summary of topic_2"
gist_1 = "Gist of topic_2"
summary_2 = "Summary of topic"
gist_2 = "Gist of topic"
DiscourseAi::Completions::Llm.with_prepared_responses([summary_1, summary_2]) do
subject.execute({})
end
DiscourseAi::Completions::Llm.with_prepared_responses(
[summary_1, summary_2, gist_1, gist_2],
) { subject.execute({}) }
expect(AiSummary.find_by(target: topic_2).summarized_text).to eq(summary_1)
expect(AiSummary.find_by(target: topic).summarized_text).to eq(summary_2)
expect(AiSummary.complete.find_by(target: topic_2).summarized_text).to eq(summary_1)
expect(AiSummary.gist.find_by(target: topic_2).summarized_text).to eq(gist_1)
expect(AiSummary.complete.find_by(target: topic).summarized_text).to eq(summary_2)
expect(AiSummary.gist.find_by(target: topic).summarized_text).to eq(gist_2)
end
end
end