2023-06-27 10:44:34 -04:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
|
|
|
# Produces, caches and serves summaries of a topic.
#
# Text generation is delegated to the strategy object given at construction
# time. This class selects which posts are fed to the strategy, decides whether
# a previously stored SummarySection can be served instead, and persists fresh
# results. A stored summary consists of one main SummarySection
# (meta_section_id is nil) plus one SummarySection per chunk pointing back at
# the main one through meta_section_id.
class TopicSummarization
  # strategy - object responding to #summarize(content, user, &blk) and
  #            #display_name.
  def initialize(strategy)
    @strategy = strategy
  end

  # Returns a summary for the topic, reusing the cached one when allowed.
  #
  # topic          - the topic to summarize.
  # user           - the requesting user; may be nil.
  # opts           - options hash; a truthy :skip_age_check lets a stale summary
  #                  be regenerated even if it is less than one hour old.
  # on_partial_blk - optional block forwarded untouched to strategy.summarize.
  #
  # Returns nil when there is neither a user nor a cached summary, the cached
  # main SummarySection when use_cached? says so, and otherwise the newly
  # created main SummarySection.
  def summarize(topic, user, opts = {}, &on_partial_blk)
    existing_summary = SummarySection.find_by(target: topic, meta_section_id: nil)

    # Existing summary shouldn't be nil in this scenario because the controller checks its existence.
    return if !user && !existing_summary

    targets_data = summary_targets(topic).pluck(:post_number, :raw, :username)
    post_numbers = targets_data.map(&:first)

    current_topic_sha = build_sha(post_numbers)
    can_summarize = Summarization::Base.can_request_summary_for?(user)

    if use_cached?(existing_summary, can_summarize, current_topic_sha, !!opts[:skip_age_check])
      # It's important that we signal a cached summary is outdated
      existing_summary.mark_as_outdated if new_targets?(existing_summary, current_topic_sha)

      return existing_summary
    end

    # NOTE(review): the old sections are removed before the strategy runs, so a
    # failure in strategy.summarize leaves the topic without any cached summary.
    delete_cached_summaries_of(topic) if existing_summary

    content = build_content(topic, targets_data)
    summarization_result = strategy.summarize(content, user, &on_partial_blk)

    cache_summary(summarization_result, post_numbers, topic)
  end

  # Returns the relation of posts that should be summarized for the topic:
  # best_replies when topic.has_summary? is true, pick_selection otherwise.
  def summary_targets(topic)
    topic.has_summary? ? best_replies(topic) : pick_selection(topic)
  end

  private

  attr_reader :strategy

  # Builds the payload handed to the strategy: where the topic lives, its title
  # and one { poster:, id:, text: } entry per selected post, in the order given.
  def build_content(topic, targets_data)
    contents =
      targets_data.map do |(post_number, raw, username)|
        { poster: username, id: post_number, text: post_text(topic, post_number, raw) }
      end

    {
      resource_path: "#{Discourse.base_path}/t/-/#{topic.id}",
      content_title: topic.title,
      contents: contents,
    }
  end

  # Text to summarize for a single post. The first post of a topic that has a
  # non-blank embed content cache uses that cache instead of the post's raw.
  def post_text(topic, post_number, raw)
    return raw unless post_number == 1

    embed_text = topic.topic_embed&.embed_content_cache
    embed_text.present? ? embed_text : raw
  end

  # Posts returned by Post.summary for the topic, limited to regular,
  # non-hidden posts, joined with their user and ordered by post number.
  def best_replies(topic)
    Post
      .summary(topic.id)
      .where("post_type = ?", Post.types[:regular])
      .where("NOT hidden")
      .joins(:user)
      .order(:post_number)
  end

  # Selection used when topic.has_summary? is false: the first 5 posts, up to
  # 50 posts with the highest score and the last 5 posts, considering only
  # regular, non-hidden posts. Returned ordered by post number and joined with
  # the user.
  def pick_selection(topic)
    posts =
      Post
        .where(topic_id: topic.id)
        .where("post_type = ?", Post.types[:regular])
        .where("NOT hidden")
        .order(:post_number)

    post_numbers = posts.limit(5).pluck(:post_number)
    post_numbers += posts.reorder("posts.score desc").limit(50).pluck(:post_number)
    post_numbers += posts.reorder("post_number desc").limit(5).pluck(:post_number)

    # The three picks overlap (always so on short topics); drop the repeats
    # instead of sending them in the IN list. The selected rows are unchanged.
    post_numbers = post_numbers.uniq

    Post
      .where(topic_id: topic.id)
      .joins(:user)
      .where("post_number in (?)", post_numbers)
      .order(:post_number)
  end

  # Removes every stored section (main and chunks) targeting the topic.
  def delete_cached_summaries_of(topic)
    SummarySection.where(target: topic).destroy_all
  end

  # For users without permissions to generate a summary or fresh summaries, we return what we have cached.
  #
  # A cached summary is replaced only when all of the following hold: the user
  # may request summaries, the selected posts changed since it was built, and
  # it is more than one hour old (or the age check is skipped).
  def use_cached?(existing_summary, can_summarize, current_sha, skip_age_check)
    return false unless existing_summary

    regenerate =
      can_summarize && new_targets?(existing_summary, current_sha) &&
        (skip_age_check || existing_summary.created_at < 1.hour.ago)

    !regenerate
  end

  # True when the summary was built from a different set of post numbers than
  # the one identified by current_sha.
  def new_targets?(summary, current_sha)
    summary.original_content_sha != current_sha
  end

  # Persists the strategy result: one main section covering the whole range of
  # post numbers and one section per entry of result[:chunks] linked to it.
  # Returns the main section.
  def cache_summary(result, post_numbers, topic)
    main_summary =
      SummarySection.create!(
        target: topic,
        algorithm: strategy.display_name,
        content_range: (post_numbers.first..post_numbers.last),
        summarized_text: result[:summary],
        original_content_sha: build_sha(post_numbers),
      )

    result[:chunks].each do |chunk|
      chunk_ids = chunk[:ids]

      SummarySection.create!(
        target: topic,
        algorithm: strategy.display_name,
        content_range: chunk_ids.min..chunk_ids.max,
        summarized_text: chunk[:summary],
        original_content_sha: build_sha(chunk_ids),
        meta_section_id: main_summary.id,
      )
    end

    main_summary
  end

  # Fingerprint of a list of post numbers, used to detect that the summarized
  # content changed.
  #
  # NOTE(review): ids are joined without a separator, so distinct lists can
  # collide ([1, 2, 3] and [12, 3] both digest "123"). Left as is because the
  # value is compared against original_content_sha values already stored.
  def build_sha(ids)
    Digest::SHA256.hexdigest(ids.join)
  end
end
|