# frozen_string_literal: true

module DiscourseAi
  module Summarization
    module Strategies
      # Summarization strategy that builds the prompts and input data for a
      # short, single-sentence "gist" of a topic.
      #
      # `target` is provided by the parent class (not visible in this file);
      # the code below reads `id`, `title`, `highest_post_number` and
      # `topic_embed` from it — presumably a Topic; verify against Base.
      class HotTopicGists < Base
        # @return the summary type identifier stored under the `:gist` key of
        #   `AiSummary.summary_types`.
        def type
          AiSummary.summary_types[:gist]
        end

        # @return [String] the feature name associated with this strategy.
        def feature
          "gists"
        end

        # Highest number reached by the target at summarization time. For
        # topics this is `highest_post_number`; per the change that introduced
        # it, the value is used to pick backfill candidates before the more
        # expensive SHA comparison.
        #
        # @return the target's `highest_post_number`.
        def highest_target_number
          target.highest_post_number
        end

        # Collects the posts that will be fed to the model: the opening post
        # plus the topic's recent regular, non-hidden posts.
        #
        # @return [Array<Hash>] one hash per post, ordered by post number, with
        #   the keys `:poster` (username), `:id` (post number) and `:text`.
        def targets_data
          op_post_number = 1
          hot_topics_recent_cutoff = Time.zone.now - SiteSetting.hot_topics_recent_days.days

          # Base scope shared by both lookups below: regular, non-hidden posts
          # of this topic.
          visible_regular_posts =
            Post
              .where(topic_id: target.id)
              .where("post_type = ?", Post.types[:regular])
              .where("NOT hidden")

          recent_hot_posts =
            visible_regular_posts.where("created_at >= ?", hot_topics_recent_cutoff).pluck(
              :post_number,
            )

          # It may happen that a topic is hot without any recent posts.
          # In that case, we'll just grab the last 20 posts
          # for a useful summary of the current state of the topic.
          if recent_hot_posts.empty?
            recent_hot_posts =
              visible_regular_posts.order("post_number DESC").limit(20).pluck(:post_number)
          end

          # The opening post is always requested alongside the selected posts.
          # A new array is built so the plucked list is not mutated; a
          # duplicated post number inside IN (...) is harmless.
          posts_data =
            Post
              .where(topic_id: target.id)
              .joins(:user)
              .where("post_number IN (?)", [op_post_number, *recent_hot_posts])
              .order(:post_number)
              .pluck(:post_number, :raw, :username)

          posts_data.map do |post_number, raw, username|
            text = raw

            # For the opening post, prefer the cached embed content when the
            # topic has one.
            if post_number == op_post_number
              embed_content = target.topic_embed&.embed_content_cache
              text = embed_content if embed_content.present?
            end

            { poster: username, id: post_number, text: text }
          end
        end

        # Builds the prompt that asks the model to update an existing gist
        # with new posts.
        #
        # @param summary the existing single-sentence summary; interpolated
        #   into the user message.
        # @param contents items responding to `to_a`, each exposing `:id`,
        #   `:poster` and `:text` (the shape returned by #targets_data).
        # @return [DiscourseAi::Completions::Prompt] the prompt with the
        #   system text and one user message pushed onto it.
        def summary_extension_prompt(summary, contents)
          statements = format_statements(contents).join("\n")

          prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip, topic_id: target.id)
            You are an advanced summarization bot. Your task is to update an existing single-sentence summary by integrating new developments from a conversation.
            Analyze the most recent messages to identify key updates or shifts in the main topic and reflect these in the updated summary.
            Emphasize new significant information or developments within the context of the initial conversation theme.

            ### Guidelines:

            - Ensure the revised summary remains concise and objective, maintaining a focus on the central theme or issue.
            - Omit extraneous details or subjective opinions.
            - Use the original language of the text.
            - Begin directly with the main topic or issue, avoiding introductory phrases.
            - Limit the updated summary to a maximum of 40 words.
            - Return the 40-word summary inside <ai></ai> tags.

          TEXT

          prompt.push(type: :user, content: <<~TEXT.strip)
            ### Context:

            This is the existing single-sentence summary:

            #{summary}

            And these are the new developments in the conversation:

            #{statements}

            Your task is to update an existing single-sentence summary by integrating new developments from a conversation.
            Return the 40-word summary inside <ai></ai> tags.
          TEXT

          prompt
        end

        # Builds the prompt that asks the model for a brand-new gist.
        #
        # The first item of `contents` is presented as the statement that
        # started the conversation; any remaining items are presented as the
        # subsequent discussion.
        #
        # @param contents items responding to `to_a`, each exposing `:id`,
        #   `:poster` and `:text` (the shape returned by #targets_data).
        # @return [DiscourseAi::Completions::Prompt] the prompt with the
        #   system text and one user message pushed onto it.
        def first_summary_prompt(contents)
          content_title = target.title
          opening_statement, *later_statements = format_statements(contents)

          prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip, topic_id: target.id)
            You are an advanced summarization bot. Analyze a given conversation and produce a concise,
            single-sentence summary that conveys the main topic and current developments to someone with no prior context.

            ### Guidelines:

            - Emphasize the most recent updates while considering their significance within the original post.
            - Focus on the central theme or issue being addressed, maintaining an objective and neutral tone.
            - Exclude extraneous details or subjective opinions.
            - Use the original language of the text.
            - Begin directly with the main topic or issue, avoiding introductory phrases.
            - Limit the summary to a maximum of 40 words.
            - Do *NOT* repeat the discussion title in the summary.

            Return the summary inside <ai></ai> tags.\n
          TEXT

          # The title line is only emitted when the topic has a title.
          title_line =
            if content_title.present?
              "The discussion title is: #{content_title}. (DO NOT REPEAT THIS IN THE SUMMARY)\n"
            else
              ""
            end

          # Unary plus yields a mutable string so the closing instructions can
          # be appended below despite the frozen_string_literal magic comment.
          context = +<<~TEXT
            ### Context:

            #{title_line}

            The conversation began with the following statement:

            #{opening_statement}\n
          TEXT

          if later_statements.present?
            context << <<~TEXT
              Subsequent discussion includes the following:

              #{later_statements.join("\n")}

              Your task is to focus on these latest messages, capturing their meaning in the context of the initial statement.
            TEXT
          else
            context << "Your task is to capture the meaning of the initial statement."
          end

          prompt.push(type: :user, content: <<~TEXT.strip)
            #{context} Return the 40-word summary inside <ai></ai> tags.
          TEXT

          prompt
        end

        private

        # Renders each content item as the one-line statement used in the
        # prompts.
        #
        # NOTE(review): the opening parenthesis is never closed and the string
        # ends with a space. This is kept byte-for-byte because it is part of
        # the text sent to the model — confirm whether it is intentional.
        #
        # @param contents items responding to `to_a`, each exposing `:id`,
        #   `:poster` and `:text`.
        # @return [Array<String>] one formatted statement per item.
        def format_statements(contents)
          contents.to_a.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
        end
      end
    end
  end
end