FIX: more robust summary implementation (#750)

When navigating between topics we were not correctly resetting
internal state for summarization. This led to a situation where
incorrect summaries could be displayed to users.

Additionally, our controller for grabbing summaries was always
streaming results via message bus, which could be delayed when
sidekiq is overloaded. We now return the cached summary right
away, directly from the REST endpoint, if it is available.
This commit is contained in:
Sam 2024-08-13 21:47:47 +10:00 committed by GitHub
parent f72ab12761
commit 14443bf890
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 130 additions and 57 deletions

View File

@ -14,19 +14,32 @@ module DiscourseAi
RateLimiter.new(current_user, "summary", 6, 5.minutes).performed! if current_user
opts = params.permit(:skip_age_check)
skip_age_check = opts[:skip_age_check] == "true"
if params[:stream] && current_user
cached_summary = DiscourseAi::TopicSummarization.cached_summary(topic, current_user)
if cached_summary && !skip_age_check
render_serialized(cached_summary, AiTopicSummarySerializer)
return
end
Jobs.enqueue(
:stream_topic_ai_summary,
topic_id: topic.id,
user_id: current_user.id,
opts: opts.as_json,
skip_age_check: skip_age_check,
)
render json: success_json
else
hijack do
summary = DiscourseAi::TopicSummarization.summarize(topic, current_user, opts)
summary =
DiscourseAi::TopicSummarization.summarize(
topic,
current_user,
skip_age_check: skip_age_check,
)
render_serialized(summary, AiTopicSummarySerializer)
end
end

View File

@ -14,15 +14,15 @@ module Jobs
guardian = Guardian.new(user)
return unless guardian.can_see?(topic)
opts = args[:opts] || {}
skip_age_check = !!args[:skip_age_check]
streamed_summary = +""
start = Time.now
summary =
DiscourseAi::TopicSummarization
.new(strategy)
.summarize(topic, user, opts) do |partial_summary|
.new(strategy, topic, user)
.summarize(skip_age_check: skip_age_check) do |partial_summary|
streamed_summary << partial_summary
# Throttle updates.

View File

@ -2,33 +2,36 @@
module DiscourseAi
class TopicSummarization
def self.summarize(topic, user, opts = {}, &on_partial_blk)
new(DiscourseAi::Summarization.default_strategy).summarize(topic, user, opts, &on_partial_blk)
def self.summarize(topic, user, skip_age_check: false, &on_partial_blk)
new(DiscourseAi::Summarization.default_strategy, topic, user).summarize(
skip_age_check: skip_age_check,
&on_partial_blk
)
end
def initialize(strategy)
def self.cached_summary(topic, user)
new(DiscourseAi::Summarization.default_strategy, topic, user).cached_summary
end
def initialize(strategy, topic, user)
@strategy = strategy
@topic = topic
@user = user
end
def summarize(topic, user, opts = {}, &on_partial_blk)
existing_summary = AiSummary.find_by(target: topic)
attr_reader :strategy, :topic, :user
def cached_summary
existing_summary
end
def summarize(skip_age_check: false, &on_partial_blk)
# Existing summary shouldn't be nil in this scenario because the controller checks its existence.
return if !user && !existing_summary
targets_data = summary_targets(topic).pluck(:post_number, :raw, :username)
return existing_summary if use_cached?(skip_age_check)
current_topic_sha = build_sha(targets_data.map(&:first))
can_summarize = Guardian.new(user).can_request_summary?
if use_cached?(existing_summary, can_summarize, current_topic_sha, !!opts[:skip_age_check])
# It's important that we signal a cached summary is outdated
existing_summary.mark_as_outdated if new_targets?(existing_summary, current_topic_sha)
return existing_summary
end
delete_cached_summaries_of(topic) if existing_summary
delete_cached_summaries! if existing_summary
content = {
resource_path: "#{Discourse.base_path}/t/-/#{topic.id}",
@ -36,7 +39,7 @@ module DiscourseAi
contents: [],
}
targets_data.map do |(pn, raw, username)|
summary_targets_data.map do |(pn, raw, username)|
raw_text = raw
if pn == 1 && topic.topic_embed&.embed_content_cache.present?
@ -47,19 +50,34 @@ module DiscourseAi
end
summarization_result = strategy.summarize(content, user, &on_partial_blk)
cache_summary(summarization_result, targets_data.map(&:first), topic)
cache_summary(summarization_result)
end
def summary_targets(topic)
topic.has_summary? ? best_replies(topic) : pick_selection(topic)
def summary_targets
topic.has_summary? ? best_replies : pick_selection
end
private
attr_reader :strategy
def summary_sha
@summary_sha ||= build_sha(summary_targets_data.map(&:first))
end
def best_replies(topic)
def summary_targets_data
@summary_targets_data ||= summary_targets.pluck(:post_number, :raw, :username)
end
def existing_summary
if !defined?(@existing_summary)
@existing_summary = AiSummary.find_by(target: topic)
if @existing_summary && existing_summary.original_content_sha != summary_sha
@existing_summary.mark_as_outdated
end
end
@existing_summary
end
def best_replies
Post
.summary(topic.id)
.where("post_type = ?", Post.types[:regular])
@ -68,7 +86,7 @@ module DiscourseAi
.order(:post_number)
end
def pick_selection(topic)
def pick_selection
posts =
Post
.where(topic_id: topic.id)
@ -87,31 +105,34 @@ module DiscourseAi
.order(:post_number)
end
def delete_cached_summaries_of(topic)
def delete_cached_summaries!
AiSummary.where(target: topic).destroy_all
end
# For users without permissions to generate a summary or fresh summaries, we return what we have cached.
def use_cached?(existing_summary, can_summarize, current_sha, skip_age_check)
def use_cached?(skip_age_check)
can_summarize = Guardian.new(user).can_request_summary?
existing_summary &&
!(
can_summarize && new_targets?(existing_summary, current_sha) &&
can_summarize && new_targets? &&
(skip_age_check || existing_summary.created_at < 1.hour.ago)
)
end
def new_targets?(summary, current_sha)
summary.original_content_sha != current_sha
def new_targets?
existing_summary&.original_content_sha != summary_sha
end
def cache_summary(result, post_numbers, topic)
def cache_summary(result)
post_numbers = summary_targets_data.map(&:first)
cached_summary =
AiSummary.create!(
target: topic,
algorithm: strategy.display_name,
content_range: (post_numbers.first..post_numbers.last),
summarized_text: result[:summary],
original_content_sha: build_sha(post_numbers),
original_content_sha: summary_sha,
)
cached_summary

View File

@ -3,6 +3,7 @@ import { tracked } from "@glimmer/tracking";
import { array } from "@ember/helper";
import { action } from "@ember/object";
import didInsert from "@ember/render-modifiers/modifiers/did-insert";
import didUpdate from "@ember/render-modifiers/modifiers/did-update";
import willDestroy from "@ember/render-modifiers/modifiers/will-destroy";
import { service } from "@ember/service";
import DButton from "discourse/components/d-button";
@ -44,6 +45,17 @@ export default class AiSummaryBox extends Component {
return outdatedText;
}
resetSummary() {
this.text = "";
this.summarizedOn = null;
this.summarizedBy = null;
this.newPostsSinceSummary = null;
this.outdated = false;
this.canRegenerate = false;
this.loading = false;
this._channel = null;
}
get topRepliesSummaryEnabled() {
return this.args.outletArgs.postStream.summary;
}
@ -57,8 +69,12 @@ export default class AiSummaryBox extends Component {
}
@bind
subscribe() {
subscribe(unsubscribe) {
if (unsubscribe && this._channel) {
this.unsubscribe();
}
const channel = `/discourse-ai/summaries/topic/${this.args.outletArgs.topic.id}`;
this._channel = channel;
this.messageBus.subscribe(channel, this._updateSummary);
}
@ -68,6 +84,7 @@ export default class AiSummaryBox extends Component {
"/discourse-ai/summaries/topic/*",
this._updateSummary
);
this.resetSummary();
}
@action
@ -106,7 +123,7 @@ export default class AiSummaryBox extends Component {
this.summarizedOn = null;
return ajax(url).then((data) => {
if (!this.currentUser) {
if (data?.ai_topic_summary?.summarized_text) {
data.done = true;
this._updateSummary(data);
}
@ -153,6 +170,7 @@ export default class AiSummaryBox extends Component {
<div
class="ai-summarization-button"
{{didInsert this.subscribe}}
{{didUpdate this.subscribe @outletArgs.topic.id}}
{{willDestroy this.unsubscribe}}
>
<DMenu

View File

@ -11,6 +11,28 @@ RSpec.describe DiscourseAi::Summarization::SummaryController do
SiteSetting.ai_summarization_enabled = true
end
context "when streaming" do
it "return a cached summary with json payload and does not trigger job if it exists" do
section =
AiSummary.create!(
target: topic,
summarized_text: "test",
algorithm: "test",
original_content_sha: "test",
)
sign_in(Fabricate(:admin))
get "/discourse-ai/summarization/t/#{topic.id}.json?stream=true"
expect(response.status).to eq(200)
expect(Jobs::StreamTopicAiSummary.jobs.size).to eq(0)
summary = response.parsed_body
expect(summary.dig("ai_topic_summary", "summarized_text")).to eq(section.summarized_text)
end
end
context "for anons" do
it "returns a 404 if there is no cached summary" do
get "/discourse-ai/summarization/t/#{topic.id}.json"

View File

@ -14,12 +14,12 @@ describe DiscourseAi::TopicSummarization do
let(:strategy) { DiscourseAi::Summarization.default_strategy }
shared_examples "includes only public-visible topics" do
subject { DiscourseAi::TopicSummarization.new(strategy) }
subject { DiscourseAi::TopicSummarization.new(strategy, topic, user) }
it "only includes visible posts" do
topic.first_post.update!(hidden: true)
posts = subject.summary_targets(topic)
posts = subject.summary_targets
expect(posts.none?(&:hidden?)).to eq(true)
end
@ -27,7 +27,7 @@ describe DiscourseAi::TopicSummarization do
it "doesn't include posts without users" do
topic.first_post.user.destroy!
posts = subject.summary_targets(topic)
posts = subject.summary_targets
expect(posts.detect { |p| p.id == topic.first_post.id }).to be_nil
end
@ -35,7 +35,7 @@ describe DiscourseAi::TopicSummarization do
it "doesn't include deleted posts" do
topic.first_post.update!(user_id: nil)
posts = subject.summary_targets(topic)
posts = subject.summary_targets
expect(posts.detect { |p| p.id == topic.first_post.id }).to be_nil
end
@ -56,7 +56,7 @@ describe DiscourseAi::TopicSummarization do
end
describe "#summarize" do
subject(:summarization) { described_class.new(strategy) }
subject(:summarization) { described_class.new(strategy, topic, user) }
def assert_summary_is_cached(topic, summary_response)
cached_summary = AiSummary.find_by(target: topic)
@ -72,14 +72,14 @@ describe DiscourseAi::TopicSummarization do
it "caches the summary" do
DiscourseAi::Completions::Llm.with_prepared_responses([summary]) do
section = summarization.summarize(topic, user)
section = summarization.summarize
expect(section.summarized_text).to eq(summary)
assert_summary_is_cached(topic, summary)
end
end
it "returns the cached version in subsequent calls" do
summarization.summarize(topic, user)
summarization.summarize
cached_summary_text = "This is a cached summary"
AiSummary.find_by(target: topic).update!(
@ -87,7 +87,8 @@ describe DiscourseAi::TopicSummarization do
updated_at: 24.hours.ago,
)
section = summarization.summarize(topic, user)
summarization = described_class.new(strategy, topic, user)
section = summarization.summarize
expect(section.summarized_text).to eq(cached_summary_text)
end
@ -101,7 +102,7 @@ describe DiscourseAi::TopicSummarization do
)
DiscourseAi::Completions::Llm.with_prepared_responses(["A summary"]) do |spy|
summarization.summarize(topic, user)
summarization.summarize
prompt_raw =
spy
@ -133,7 +134,7 @@ describe DiscourseAi::TopicSummarization do
# so we create the cached summary totally independantly
DiscourseAi::Completions::Llm.with_prepared_responses([cached_text]) do
strategy = DiscourseAi::Summarization.default_strategy
described_class.new(strategy).summarize(topic, user)
described_class.new(strategy, topic, user).summarize
end
cached_summary.update!(summarized_text: cached_text, created_at: 24.hours.ago)
@ -142,7 +143,7 @@ describe DiscourseAi::TopicSummarization do
context "when the user can requests new summaries" do
context "when there are no new posts" do
it "returns the cached summary" do
section = summarization.summarize(topic, user)
section = summarization.summarize
expect(section.summarized_text).to eq(cached_text)
end
@ -153,7 +154,7 @@ describe DiscourseAi::TopicSummarization do
it "returns a new summary" do
DiscourseAi::Completions::Llm.with_prepared_responses([updated_summary]) do
section = summarization.summarize(topic, user)
section = summarization.summarize
expect(section.summarized_text).to eq(updated_summary)
end
@ -165,7 +166,7 @@ describe DiscourseAi::TopicSummarization do
it "returns the cached summary" do
cached_summary.update!(created_at: 30.minutes.ago)
section = summarization.summarize(topic, user)
section = summarization.summarize
expect(section.summarized_text).to eq(cached_text)
expect(section.outdated).to eq(true)
@ -173,7 +174,7 @@ describe DiscourseAi::TopicSummarization do
it "returns a new summary if the skip_age_check flag is passed" do
DiscourseAi::Completions::Llm.with_prepared_responses([updated_summary]) do
section = summarization.summarize(topic, user, skip_age_check: true)
section = summarization.summarize(skip_age_check: true)
expect(section.summarized_text).to eq(updated_summary)
end
@ -190,9 +191,7 @@ describe DiscourseAi::TopicSummarization do
partial_result = +""
DiscourseAi::Completions::Llm.with_prepared_responses([summary]) do
summarization.summarize(topic, user) do |partial_summary|
partial_result << partial_summary
end
summarization.summarize { |partial_summary| partial_result << partial_summary }
end
expect(partial_result).to eq(summary)