2024-10-15 13:53:26 -03:00
# frozen_string_literal: true
module DiscourseAi
module Summarization
module Strategies
class TopicSummary < Base
# Identifies the kind of summary this strategy produces.
#
# @return the `:complete` entry of `AiSummary.summary_types`
def type
  summary_kinds = AiSummary.summary_types
  summary_kinds[:complete]
end
FIX: Make summaries backfill job more resilient. (#1071)
To quickly select backfill candidates without comparing SHAs, we compare the last summarized post to the topic's highest_post_number. However, hiding or deleting a post and adding a small action will update this column, causing the job to stall and re-generate the same summary repeatedly until someone posts a regular reply. On top of this, this is not always true for topics with `best_replies`, as this last reply isn't necessarily included.
Since this is not evident at first glance and each summarization strategy picks its targets differently, I'm opting to simplify the backfill logic and how we track potential candidates.
The first step is dropping `content_range`, which serves no purpose; it's only there because summary caching was originally supposed to work differently. Instead, I'm replacing it with a column called `highest_target_number`, which tracks `highest_post_number` for topics and could track other things, like a channel's `message_count`, in the future.
Now that we have this column, when selecting each potential backfill candidate we'll check whether the summary is truly outdated by comparing the SHAs; if it's not, we just update the column and move on.
2025-01-16 09:42:53 -03:00
# Reports the value used to tell how far the target has been summarized.
#
# @return the target's `highest_post_number`
def highest_target_number
  summarized_target = target
  summarized_target.highest_post_number
end
2024-10-15 13:53:26 -03:00
# Builds the post payloads that will be handed to the summarizer.
#
# Posts come from `best_replies` when the target reports `has_summary?`,
# otherwise from `pick_selection`.
#
# @return [Array<Hash>] one hash per plucked post with the keys `:poster`,
#   `:id`, `:text` and `:last_version_at`, in the order the relation yields them
def targets_data
  post_attributes = %i[post_number raw username last_version_at]
  # The full name is only fetched when names are enabled and usernames are
  # not prioritized in the UX.
  post_attributes.push(:name) if SiteSetting.enable_names && !SiteSetting.prioritize_username_in_ux

  posts = target.has_summary? ? best_replies : pick_selection

  posts.pluck(*post_attributes).map do |post_number, raw, username, last_version_at, name|
    # Only the first post may be replaced by the cached embed content; the
    # lookup is skipped entirely for every other post.
    embed_text = target.topic_embed&.embed_content_cache if post_number == 1

    {
      # `name` is nil when it was not plucked, so this falls back to username.
      poster: name.presence || username,
      id: post_number,
      # Blank or missing embed content falls back to the post's raw text.
      text: embed_text.presence || raw,
      last_version_at: last_version_at,
    }
  end
end
2025-04-02 12:54:47 -03:00
# Builds the conversation sent to the LLM to summarize the given posts.
#
# The first two messages are a worked example (request plus expected answer)
# showing the model how to link each poster to their post; the third is the
# real request, wrapping the posts in <input> tags.
#
# @param contents [Array<Hash>] post payloads; each item is read through its
#   `:id`, `:poster` and `:text` keys
# @return [Array<Hash>] three hashes, each with a `:type` (`:user` or
#   `:model`) and a `:content` string
def as_llm_messages(contents)
  resource_path = "#{Discourse.base_path}/t/-/#{target.id}"
  content_title = target.title
  # NOTE(review): real posts are serialized as "(1 user said: ..." while the
  # worked example below uses "1) user1 said: ..." — confirm which is intended.
  input =
    contents.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }.join

  # A blank title leaves an empty first line, which the trailing `strip` removes.
  request = <<~TEXT.strip
    #{content_title.present? ? "The discussion title is: " + content_title + ".\n" : ""}
    Here are the posts, inside <input></input> XML tags:

    <input>
    #{input}
    </input>

    Generate a concise, coherent summary of the text above maintaining the original language.
  TEXT

  [
    {
      type: :user,
      content:
        "Here are the posts inside <input></input> XML tags:\n\n<input>1) user1 said: I love Mondays 2) user2 said: I hate Mondays</input>\n\nGenerate a concise, coherent summary of the text above maintaining the original language.",
    },
    {
      type: :model,
      # The answer must agree with the example input above: user1 loves
      # Mondays and user2 hates them.
      content:
        "Two users are sharing their feelings toward Mondays. [user1](#{resource_path}/1) loves them, while [user2](#{resource_path}/2) hates them.",
    },
    { type: :user, content: request },
  ]
end
private
# NOTE(review): no method visible in this class reads `topic`; they all work
# through `target` instead — confirm whether this reader is still needed.
attr_reader :topic
# Relation over the target's summary posts, restricted to regular posts that
# are not hidden, joined with their users and sorted by post number.
#
# @return the relation built on `Post.summary` (presumably an
#   ActiveRecord relation — `Post.summary` is defined outside this file)
def best_replies
  summary_posts = Post.summary(target.id)

  visible_replies =
    summary_posts.where("post_type = ?", Post.types[:regular]).where("NOT hidden")

  visible_replies.joins(:user).order(:post_number)
end
# Picks a bounded sample of the target's regular, non-hidden posts: the
# first 5 by post number, the 50 with the highest score, and the last 5.
#
# @return a Post relation (joined with users, sorted by post number) limited
#   to the sampled post numbers
def pick_selection
  candidates =
    Post
      .where(topic_id: target.id)
      .where("post_type = ?", Post.types[:regular])
      .where("NOT hidden")
      .order(:post_number)

  # The three samples may overlap; repeated numbers are harmless in the IN list.
  opening = candidates.limit(5).pluck(:post_number)
  top_scored = candidates.reorder("posts.score desc").limit(50).pluck(:post_number)
  closing = candidates.reorder("post_number desc").limit(5).pluck(:post_number)
  post_numbers = opening + top_scored + closing

  Post
    .where(topic_id: target.id)
    .joins(:user)
    .where("post_number in (?)", post_numbers)
    .order(:post_number)
end
end
end
end
end