discourse-ai/app/services/discourse_ai/topic_summarization.rb
Sam 14443bf890
FIX: more robust summary implementation (#750)
When navigating between topics we were not correctly resetting
internal state for summarization. This led to a situation where
incorrect summaries could be displayed to users.

Additionally, our controller for grabbing summaries was always
streaming results via message bus, which could be delayed when
Sidekiq is overloaded. We now return the cached summary right
away, directly from the REST endpoint, if it is available.
2024-08-13 08:47:47 -03:00

146 lines
3.8 KiB
Ruby

# frozen_string_literal: true
module DiscourseAi
  # Builds, caches and serves an AI-generated summary for a single topic.
  #
  # A summary is persisted as an AiSummary row targeting the topic, together
  # with a SHA of the post numbers it was generated from. That SHA is compared
  # on later requests to detect that the set of summarized posts has changed.
  class TopicSummarization
    # Summarizes +topic+ using the default summarization strategy.
    # Arguments and return value are those of #summarize.
    def self.summarize(topic, user, skip_age_check: false, &on_partial_blk)
      new(DiscourseAi::Summarization.default_strategy, topic, user).summarize(
        skip_age_check: skip_age_check,
        &on_partial_blk
      )
    end

    # Returns the stored summary for +topic+ (or nil) without generating one.
    def self.cached_summary(topic, user)
      new(DiscourseAi::Summarization.default_strategy, topic, user).cached_summary
    end

    # @param strategy object responding to #summarize(content, user, &blk) and #display_name
    # @param topic the topic being summarized
    # @param user the requesting user; may be nil
    def initialize(strategy, topic, user)
      @strategy = strategy
      @topic = topic
      @user = user
    end

    attr_reader :strategy, :topic, :user

    # Returns the stored AiSummary for the topic, or nil when there is none.
    # The record is flagged via #mark_as_outdated when its SHA no longer
    # matches the current summary targets.
    def cached_summary
      existing_summary
    end

    # Returns a summary for the topic, generating and caching a new one when
    # the stored one may be replaced (see #use_cached?).
    #
    # @param skip_age_check [Boolean] when true, a summary whose targets changed
    #   is regenerated even if it is less than an hour old
    # @param on_partial_blk optional block forwarded untouched to the strategy
    # @return the AiSummary record, or nil when there is no user and no stored summary
    def summarize(skip_age_check: false, &on_partial_blk)
      # Existing summary shouldn't be nil in this scenario because the controller checks its existence.
      return if !user && !existing_summary
      return existing_summary if use_cached?(skip_age_check)

      delete_cached_summaries! if existing_summary

      summarization_result = strategy.summarize(summarization_content, user, &on_partial_blk)
      cache_summary(summarization_result)
    end

    # Relation of the posts that feed the summary: the topic's best replies
    # when the topic reports having a summary, otherwise a hand-picked selection.
    def summary_targets
      topic.has_summary? ? best_replies : pick_selection
    end

    private

    # Payload handed to the strategy: where the topic lives, its title, and one
    # entry per targeted post.
    def summarization_content
      contents =
        summary_targets_data.map do |(post_number, raw, username)|
          text = raw
          # For the first post, prefer the cached embed content when there is some.
          if post_number == 1 && topic.topic_embed&.embed_content_cache.present?
            text = topic.topic_embed&.embed_content_cache
          end

          { poster: username, id: post_number, text: text }
        end

      {
        resource_path: "#{Discourse.base_path}/t/-/#{topic.id}",
        content_title: topic.title,
        contents: contents,
      }
    end

    # SHA of the post numbers currently targeted for summarization (memoized).
    def summary_sha
      @summary_sha ||= build_sha(summary_targets_data.map(&:first))
    end

    # [post_number, raw, username] triples for the targeted posts (memoized).
    def summary_targets_data
      @summary_targets_data ||= summary_targets.pluck(:post_number, :raw, :username)
    end

    # Looks up the stored summary once per instance. `defined?` is used instead
    # of `||=` so that a nil result ("no summary") is memoized as well.
    def existing_summary
      return @existing_summary if defined?(@existing_summary)

      @existing_summary = AiSummary.find_by(target: topic)
      if @existing_summary && @existing_summary.original_content_sha != summary_sha
        @existing_summary.mark_as_outdated
      end

      @existing_summary
    end

    # Visible regular posts returned by Post.summary for this topic, ordered by
    # post number.
    def best_replies
      Post
        .summary(topic.id)
        .where("post_type = ?", Post.types[:regular])
        .where("NOT hidden")
        .joins(:user)
        .order(:post_number)
    end

    # Fallback selection among the topic's visible regular posts: the first 5
    # by post number, up to 50 with the highest score, and the last 5.
    def pick_selection
      posts =
        Post
          .where(topic_id: topic.id)
          .where("post_type = ?", Post.types[:regular])
          .where("NOT hidden")
          .order(:post_number)

      post_numbers = posts.limit(5).pluck(:post_number)
      post_numbers += posts.reorder("posts.score desc").limit(50).pluck(:post_number)
      post_numbers += posts.reorder("post_number desc").limit(5).pluck(:post_number)

      Post
        .where(topic_id: topic.id)
        .joins(:user)
        .where("post_number in (?)", post_numbers)
        .order(:post_number)
    end

    # Destroys every stored summary for the topic and clears the memoized
    # record so this instance never hands out a destroyed row.
    def delete_cached_summaries!
      AiSummary.where(target: topic).destroy_all
      @existing_summary = nil
    end

    # The stored summary is served unless ALL of the following hold: the user
    # may request summaries, the summary targets changed, and the summary is
    # older than one hour (or the age check is skipped).
    def use_cached?(skip_age_check)
      can_summarize = Guardian.new(user).can_request_summary?

      existing_summary &&
        !(
          can_summarize && new_targets? &&
            (skip_age_check || existing_summary.created_at < 1.hour.ago)
        )
    end

    # True when the stored summary is missing or was built from a different
    # set of post numbers.
    def new_targets?
      existing_summary&.original_content_sha != summary_sha
    end

    # Persists +result+ as the topic's summary and memoizes the new record so
    # later calls on this instance (e.g. #cached_summary) see it.
    def cache_summary(result)
      post_numbers = summary_targets_data.map(&:first)

      @existing_summary =
        AiSummary.create!(
          target: topic,
          algorithm: strategy.display_name,
          content_range: (post_numbers.first..post_numbers.last),
          summarized_text: result[:summary],
          original_content_sha: summary_sha,
        )
    end

    # NOTE(review): ids are joined without a separator, so e.g. [1, 23] and
    # [12, 3] produce the same digest. Left as is because stored
    # original_content_sha values depend on this exact format.
    def build_sha(ids)
      Digest::SHA256.hexdigest(ids.join)
    end
  end
end