FIX: more robust summary implementation (#750)

When navigating between topic we were not correctly resetting
internal state for summarization. This leads to a situation where
incorrect summaries can be displayed to users and wrong summaries
can be displayed.

Additionally our controller for grabbing summaries was always
streaming results via message bus, which could be delayed when
sidekiq is overloaded. We now will return the cached summary
right away if it is available direct from REST endpoint.
This commit is contained in:
Sam 2024-08-13 21:47:47 +10:00 committed by GitHub
parent f72ab12761
commit 14443bf890
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 130 additions and 57 deletions

View File

@ -14,19 +14,32 @@ module DiscourseAi
RateLimiter.new(current_user, "summary", 6, 5.minutes).performed! if current_user RateLimiter.new(current_user, "summary", 6, 5.minutes).performed! if current_user
opts = params.permit(:skip_age_check) opts = params.permit(:skip_age_check)
skip_age_check = opts[:skip_age_check] == "true"
if params[:stream] && current_user if params[:stream] && current_user
cached_summary = DiscourseAi::TopicSummarization.cached_summary(topic, current_user)
if cached_summary && !skip_age_check
render_serialized(cached_summary, AiTopicSummarySerializer)
return
end
Jobs.enqueue( Jobs.enqueue(
:stream_topic_ai_summary, :stream_topic_ai_summary,
topic_id: topic.id, topic_id: topic.id,
user_id: current_user.id, user_id: current_user.id,
opts: opts.as_json, skip_age_check: skip_age_check,
) )
render json: success_json render json: success_json
else else
hijack do hijack do
summary = DiscourseAi::TopicSummarization.summarize(topic, current_user, opts) summary =
DiscourseAi::TopicSummarization.summarize(
topic,
current_user,
skip_age_check: skip_age_check,
)
render_serialized(summary, AiTopicSummarySerializer) render_serialized(summary, AiTopicSummarySerializer)
end end
end end

View File

@ -14,15 +14,15 @@ module Jobs
guardian = Guardian.new(user) guardian = Guardian.new(user)
return unless guardian.can_see?(topic) return unless guardian.can_see?(topic)
opts = args[:opts] || {} skip_age_check = !!args[:skip_age_check]
streamed_summary = +"" streamed_summary = +""
start = Time.now start = Time.now
summary = summary =
DiscourseAi::TopicSummarization DiscourseAi::TopicSummarization
.new(strategy) .new(strategy, topic, user)
.summarize(topic, user, opts) do |partial_summary| .summarize(skip_age_check: skip_age_check) do |partial_summary|
streamed_summary << partial_summary streamed_summary << partial_summary
# Throttle updates. # Throttle updates.

View File

@ -2,33 +2,36 @@
module DiscourseAi module DiscourseAi
class TopicSummarization class TopicSummarization
def self.summarize(topic, user, opts = {}, &on_partial_blk) def self.summarize(topic, user, skip_age_check: false, &on_partial_blk)
new(DiscourseAi::Summarization.default_strategy).summarize(topic, user, opts, &on_partial_blk) new(DiscourseAi::Summarization.default_strategy, topic, user).summarize(
skip_age_check: skip_age_check,
&on_partial_blk
)
end end
def initialize(strategy) def self.cached_summary(topic, user)
new(DiscourseAi::Summarization.default_strategy, topic, user).cached_summary
end
def initialize(strategy, topic, user)
@strategy = strategy @strategy = strategy
@topic = topic
@user = user
end end
def summarize(topic, user, opts = {}, &on_partial_blk) attr_reader :strategy, :topic, :user
existing_summary = AiSummary.find_by(target: topic)
def cached_summary
existing_summary
end
def summarize(skip_age_check: false, &on_partial_blk)
# Existing summary shouldn't be nil in this scenario because the controller checks its existence. # Existing summary shouldn't be nil in this scenario because the controller checks its existence.
return if !user && !existing_summary return if !user && !existing_summary
targets_data = summary_targets(topic).pluck(:post_number, :raw, :username) return existing_summary if use_cached?(skip_age_check)
current_topic_sha = build_sha(targets_data.map(&:first)) delete_cached_summaries! if existing_summary
can_summarize = Guardian.new(user).can_request_summary?
if use_cached?(existing_summary, can_summarize, current_topic_sha, !!opts[:skip_age_check])
# It's important that we signal a cached summary is outdated
existing_summary.mark_as_outdated if new_targets?(existing_summary, current_topic_sha)
return existing_summary
end
delete_cached_summaries_of(topic) if existing_summary
content = { content = {
resource_path: "#{Discourse.base_path}/t/-/#{topic.id}", resource_path: "#{Discourse.base_path}/t/-/#{topic.id}",
@ -36,7 +39,7 @@ module DiscourseAi
contents: [], contents: [],
} }
targets_data.map do |(pn, raw, username)| summary_targets_data.map do |(pn, raw, username)|
raw_text = raw raw_text = raw
if pn == 1 && topic.topic_embed&.embed_content_cache.present? if pn == 1 && topic.topic_embed&.embed_content_cache.present?
@ -47,19 +50,34 @@ module DiscourseAi
end end
summarization_result = strategy.summarize(content, user, &on_partial_blk) summarization_result = strategy.summarize(content, user, &on_partial_blk)
cache_summary(summarization_result)
cache_summary(summarization_result, targets_data.map(&:first), topic)
end end
def summary_targets(topic) def summary_targets
topic.has_summary? ? best_replies(topic) : pick_selection(topic) topic.has_summary? ? best_replies : pick_selection
end end
private private
attr_reader :strategy def summary_sha
@summary_sha ||= build_sha(summary_targets_data.map(&:first))
end
def best_replies(topic) def summary_targets_data
@summary_targets_data ||= summary_targets.pluck(:post_number, :raw, :username)
end
def existing_summary
if !defined?(@existing_summary)
@existing_summary = AiSummary.find_by(target: topic)
if @existing_summary && existing_summary.original_content_sha != summary_sha
@existing_summary.mark_as_outdated
end
end
@existing_summary
end
def best_replies
Post Post
.summary(topic.id) .summary(topic.id)
.where("post_type = ?", Post.types[:regular]) .where("post_type = ?", Post.types[:regular])
@ -68,7 +86,7 @@ module DiscourseAi
.order(:post_number) .order(:post_number)
end end
def pick_selection(topic) def pick_selection
posts = posts =
Post Post
.where(topic_id: topic.id) .where(topic_id: topic.id)
@ -87,31 +105,34 @@ module DiscourseAi
.order(:post_number) .order(:post_number)
end end
def delete_cached_summaries_of(topic) def delete_cached_summaries!
AiSummary.where(target: topic).destroy_all AiSummary.where(target: topic).destroy_all
end end
# For users without permissions to generate a summary or fresh summaries, we return what we have cached. def use_cached?(skip_age_check)
def use_cached?(existing_summary, can_summarize, current_sha, skip_age_check) can_summarize = Guardian.new(user).can_request_summary?
existing_summary && existing_summary &&
!( !(
can_summarize && new_targets?(existing_summary, current_sha) && can_summarize && new_targets? &&
(skip_age_check || existing_summary.created_at < 1.hour.ago) (skip_age_check || existing_summary.created_at < 1.hour.ago)
) )
end end
def new_targets?(summary, current_sha) def new_targets?
summary.original_content_sha != current_sha existing_summary&.original_content_sha != summary_sha
end end
def cache_summary(result, post_numbers, topic) def cache_summary(result)
post_numbers = summary_targets_data.map(&:first)
cached_summary = cached_summary =
AiSummary.create!( AiSummary.create!(
target: topic, target: topic,
algorithm: strategy.display_name, algorithm: strategy.display_name,
content_range: (post_numbers.first..post_numbers.last), content_range: (post_numbers.first..post_numbers.last),
summarized_text: result[:summary], summarized_text: result[:summary],
original_content_sha: build_sha(post_numbers), original_content_sha: summary_sha,
) )
cached_summary cached_summary

View File

@ -3,6 +3,7 @@ import { tracked } from "@glimmer/tracking";
import { array } from "@ember/helper"; import { array } from "@ember/helper";
import { action } from "@ember/object"; import { action } from "@ember/object";
import didInsert from "@ember/render-modifiers/modifiers/did-insert"; import didInsert from "@ember/render-modifiers/modifiers/did-insert";
import didUpdate from "@ember/render-modifiers/modifiers/did-update";
import willDestroy from "@ember/render-modifiers/modifiers/will-destroy"; import willDestroy from "@ember/render-modifiers/modifiers/will-destroy";
import { service } from "@ember/service"; import { service } from "@ember/service";
import DButton from "discourse/components/d-button"; import DButton from "discourse/components/d-button";
@ -44,6 +45,17 @@ export default class AiSummaryBox extends Component {
return outdatedText; return outdatedText;
} }
resetSummary() {
this.text = "";
this.summarizedOn = null;
this.summarizedBy = null;
this.newPostsSinceSummary = null;
this.outdated = false;
this.canRegenerate = false;
this.loading = false;
this._channel = null;
}
get topRepliesSummaryEnabled() { get topRepliesSummaryEnabled() {
return this.args.outletArgs.postStream.summary; return this.args.outletArgs.postStream.summary;
} }
@ -57,8 +69,12 @@ export default class AiSummaryBox extends Component {
} }
@bind @bind
subscribe() { subscribe(unsubscribe) {
if (unsubscribe && this._channel) {
this.unsubscribe();
}
const channel = `/discourse-ai/summaries/topic/${this.args.outletArgs.topic.id}`; const channel = `/discourse-ai/summaries/topic/${this.args.outletArgs.topic.id}`;
this._channel = channel;
this.messageBus.subscribe(channel, this._updateSummary); this.messageBus.subscribe(channel, this._updateSummary);
} }
@ -68,6 +84,7 @@ export default class AiSummaryBox extends Component {
"/discourse-ai/summaries/topic/*", "/discourse-ai/summaries/topic/*",
this._updateSummary this._updateSummary
); );
this.resetSummary();
} }
@action @action
@ -106,7 +123,7 @@ export default class AiSummaryBox extends Component {
this.summarizedOn = null; this.summarizedOn = null;
return ajax(url).then((data) => { return ajax(url).then((data) => {
if (!this.currentUser) { if (data?.ai_topic_summary?.summarized_text) {
data.done = true; data.done = true;
this._updateSummary(data); this._updateSummary(data);
} }
@ -153,6 +170,7 @@ export default class AiSummaryBox extends Component {
<div <div
class="ai-summarization-button" class="ai-summarization-button"
{{didInsert this.subscribe}} {{didInsert this.subscribe}}
{{didUpdate this.subscribe @outletArgs.topic.id}}
{{willDestroy this.unsubscribe}} {{willDestroy this.unsubscribe}}
> >
<DMenu <DMenu

View File

@ -11,6 +11,28 @@ RSpec.describe DiscourseAi::Summarization::SummaryController do
SiteSetting.ai_summarization_enabled = true SiteSetting.ai_summarization_enabled = true
end end
context "when streaming" do
it "return a cached summary with json payload and does not trigger job if it exists" do
section =
AiSummary.create!(
target: topic,
summarized_text: "test",
algorithm: "test",
original_content_sha: "test",
)
sign_in(Fabricate(:admin))
get "/discourse-ai/summarization/t/#{topic.id}.json?stream=true"
expect(response.status).to eq(200)
expect(Jobs::StreamTopicAiSummary.jobs.size).to eq(0)
summary = response.parsed_body
expect(summary.dig("ai_topic_summary", "summarized_text")).to eq(section.summarized_text)
end
end
context "for anons" do context "for anons" do
it "returns a 404 if there is no cached summary" do it "returns a 404 if there is no cached summary" do
get "/discourse-ai/summarization/t/#{topic.id}.json" get "/discourse-ai/summarization/t/#{topic.id}.json"

View File

@ -14,12 +14,12 @@ describe DiscourseAi::TopicSummarization do
let(:strategy) { DiscourseAi::Summarization.default_strategy } let(:strategy) { DiscourseAi::Summarization.default_strategy }
shared_examples "includes only public-visible topics" do shared_examples "includes only public-visible topics" do
subject { DiscourseAi::TopicSummarization.new(strategy) } subject { DiscourseAi::TopicSummarization.new(strategy, topic, user) }
it "only includes visible posts" do it "only includes visible posts" do
topic.first_post.update!(hidden: true) topic.first_post.update!(hidden: true)
posts = subject.summary_targets(topic) posts = subject.summary_targets
expect(posts.none?(&:hidden?)).to eq(true) expect(posts.none?(&:hidden?)).to eq(true)
end end
@ -27,7 +27,7 @@ describe DiscourseAi::TopicSummarization do
it "doesn't include posts without users" do it "doesn't include posts without users" do
topic.first_post.user.destroy! topic.first_post.user.destroy!
posts = subject.summary_targets(topic) posts = subject.summary_targets
expect(posts.detect { |p| p.id == topic.first_post.id }).to be_nil expect(posts.detect { |p| p.id == topic.first_post.id }).to be_nil
end end
@ -35,7 +35,7 @@ describe DiscourseAi::TopicSummarization do
it "doesn't include deleted posts" do it "doesn't include deleted posts" do
topic.first_post.update!(user_id: nil) topic.first_post.update!(user_id: nil)
posts = subject.summary_targets(topic) posts = subject.summary_targets
expect(posts.detect { |p| p.id == topic.first_post.id }).to be_nil expect(posts.detect { |p| p.id == topic.first_post.id }).to be_nil
end end
@ -56,7 +56,7 @@ describe DiscourseAi::TopicSummarization do
end end
describe "#summarize" do describe "#summarize" do
subject(:summarization) { described_class.new(strategy) } subject(:summarization) { described_class.new(strategy, topic, user) }
def assert_summary_is_cached(topic, summary_response) def assert_summary_is_cached(topic, summary_response)
cached_summary = AiSummary.find_by(target: topic) cached_summary = AiSummary.find_by(target: topic)
@ -72,14 +72,14 @@ describe DiscourseAi::TopicSummarization do
it "caches the summary" do it "caches the summary" do
DiscourseAi::Completions::Llm.with_prepared_responses([summary]) do DiscourseAi::Completions::Llm.with_prepared_responses([summary]) do
section = summarization.summarize(topic, user) section = summarization.summarize
expect(section.summarized_text).to eq(summary) expect(section.summarized_text).to eq(summary)
assert_summary_is_cached(topic, summary) assert_summary_is_cached(topic, summary)
end end
end end
it "returns the cached version in subsequent calls" do it "returns the cached version in subsequent calls" do
summarization.summarize(topic, user) summarization.summarize
cached_summary_text = "This is a cached summary" cached_summary_text = "This is a cached summary"
AiSummary.find_by(target: topic).update!( AiSummary.find_by(target: topic).update!(
@ -87,7 +87,8 @@ describe DiscourseAi::TopicSummarization do
updated_at: 24.hours.ago, updated_at: 24.hours.ago,
) )
section = summarization.summarize(topic, user) summarization = described_class.new(strategy, topic, user)
section = summarization.summarize
expect(section.summarized_text).to eq(cached_summary_text) expect(section.summarized_text).to eq(cached_summary_text)
end end
@ -101,7 +102,7 @@ describe DiscourseAi::TopicSummarization do
) )
DiscourseAi::Completions::Llm.with_prepared_responses(["A summary"]) do |spy| DiscourseAi::Completions::Llm.with_prepared_responses(["A summary"]) do |spy|
summarization.summarize(topic, user) summarization.summarize
prompt_raw = prompt_raw =
spy spy
@ -133,7 +134,7 @@ describe DiscourseAi::TopicSummarization do
# so we create the cached summary totally independantly # so we create the cached summary totally independantly
DiscourseAi::Completions::Llm.with_prepared_responses([cached_text]) do DiscourseAi::Completions::Llm.with_prepared_responses([cached_text]) do
strategy = DiscourseAi::Summarization.default_strategy strategy = DiscourseAi::Summarization.default_strategy
described_class.new(strategy).summarize(topic, user) described_class.new(strategy, topic, user).summarize
end end
cached_summary.update!(summarized_text: cached_text, created_at: 24.hours.ago) cached_summary.update!(summarized_text: cached_text, created_at: 24.hours.ago)
@ -142,7 +143,7 @@ describe DiscourseAi::TopicSummarization do
context "when the user can requests new summaries" do context "when the user can requests new summaries" do
context "when there are no new posts" do context "when there are no new posts" do
it "returns the cached summary" do it "returns the cached summary" do
section = summarization.summarize(topic, user) section = summarization.summarize
expect(section.summarized_text).to eq(cached_text) expect(section.summarized_text).to eq(cached_text)
end end
@ -153,7 +154,7 @@ describe DiscourseAi::TopicSummarization do
it "returns a new summary" do it "returns a new summary" do
DiscourseAi::Completions::Llm.with_prepared_responses([updated_summary]) do DiscourseAi::Completions::Llm.with_prepared_responses([updated_summary]) do
section = summarization.summarize(topic, user) section = summarization.summarize
expect(section.summarized_text).to eq(updated_summary) expect(section.summarized_text).to eq(updated_summary)
end end
@ -165,7 +166,7 @@ describe DiscourseAi::TopicSummarization do
it "returns the cached summary" do it "returns the cached summary" do
cached_summary.update!(created_at: 30.minutes.ago) cached_summary.update!(created_at: 30.minutes.ago)
section = summarization.summarize(topic, user) section = summarization.summarize
expect(section.summarized_text).to eq(cached_text) expect(section.summarized_text).to eq(cached_text)
expect(section.outdated).to eq(true) expect(section.outdated).to eq(true)
@ -173,7 +174,7 @@ describe DiscourseAi::TopicSummarization do
it "returns a new summary if the skip_age_check flag is passed" do it "returns a new summary if the skip_age_check flag is passed" do
DiscourseAi::Completions::Llm.with_prepared_responses([updated_summary]) do DiscourseAi::Completions::Llm.with_prepared_responses([updated_summary]) do
section = summarization.summarize(topic, user, skip_age_check: true) section = summarization.summarize(skip_age_check: true)
expect(section.summarized_text).to eq(updated_summary) expect(section.summarized_text).to eq(updated_summary)
end end
@ -190,9 +191,7 @@ describe DiscourseAi::TopicSummarization do
partial_result = +"" partial_result = +""
DiscourseAi::Completions::Llm.with_prepared_responses([summary]) do DiscourseAi::Completions::Llm.with_prepared_responses([summary]) do
summarization.summarize(topic, user) do |partial_summary| summarization.summarize { |partial_summary| partial_result << partial_summary }
partial_result << partial_summary
end
end end
expect(partial_result).to eq(summary) expect(partial_result).to eq(summary)