# frozen_string_literal: true

module DiscourseAi
  module Summarization
    # This class offers a generic way of summarizing content from multiple sources using different prompts.
    #
    # It summarizes large amounts of content by recursively summarizing it in smaller chunks that
    # fit the given model's context window, finally concatenating the disjoint summaries
    # into a final version.
    #
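    # A minimal usage sketch (illustrative only; assumes you already have an LLM proxy and a
    # summarization strategy for your target):
    #
    #   summarizer = DiscourseAi::Summarization::FoldContent.new(llm, strategy)
    #   result = summarizer.summarize(Discourse.system_user) { |partial, cancel| print partial }
    #   result.summarized_text
    #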
    class FoldContent
      def initialize(llm, strategy, persist_summaries: true)
        @llm = llm
        @strategy = strategy
        @persist_summaries = persist_summaries
      end

      attr_reader :llm, :strategy

      # @param user { User } - User object used for auditing usage.
      # @param &on_partial_blk { Block - Optional } - The passed block will get called with the LLM partial response alongside a cancel function.
      # Note: The block is only called with results of the final summary, not intermediate summaries.
      #
      # This method doesn't care whether we already have an up-to-date summary; it always regenerates one.
      #
      # @returns { AiSummary } - Resulting summary.
      def summarize(user, &on_partial_blk)
        base_summary = ""
        initial_pos = 0

        truncated_content = content_to_summarize.map { |cts| truncate(cts) }

        folded_summary = fold(truncated_content, base_summary, initial_pos, user, &on_partial_blk)
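        # The model may wrap its answer in an <ai> tag; prefer the tag's text when present,
        # otherwise fall back to the raw response.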
        clean_summary =
          Nokogiri::HTML5.fragment(folded_summary).css("ai")&.first&.text || folded_summary

        if persist_summaries
          AiSummary.store!(
            strategy,
            llm_model,
            clean_summary,
            truncated_content,
            human: user&.human?,
          )
        else
          AiSummary.new(summarized_text: clean_summary)
        end
      end

      # @returns { AiSummary } - Resulting summary.
      #
      # Finds a summary matching the target and strategy. Marks it as outdated if the strategy found newer content
      def existing_summary
|
|
|
|
if !defined?(@existing_summary)
|
|
|
|
summary = AiSummary.find_by(target: strategy.target, summary_type: strategy.type)
|
|
|
|
|
|
|
|
if summary
|
|
|
|
@existing_summary = summary
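            # The summary is outdated when new content arrived since it was stored (the SHA of
            # the content ids changed) or when any summarized item was edited afterwards.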
            if summary.original_content_sha != latest_sha ||
                 content_to_summarize.any? { |cts| cts[:last_version_at] > summary.updated_at }
              summary.mark_as_outdated
            end
          end
        end
        @existing_summary
      end

      def delete_cached_summaries!
        AiSummary.where(target: strategy.target, summary_type: strategy.type).destroy_all
      end

      private

      attr_reader :persist_summaries

      def llm_model
        llm.llm_model
      end

      def content_to_summarize
        @targets_data ||= strategy.targets_data
      end
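      # Digest of the ids of the content we summarize; when it no longer matches the stored
      # summary's original_content_sha, new content has arrived since that summary was built.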
      def latest_sha
        @latest_sha ||= AiSummary.build_sha(content_to_summarize.map { |c| c[:id] }.join)
      end

      # @param items { Array<Hash> } - Content to summarize. Structure will be: { poster: who wrote the content, id: a way to order content, text: content }
      # @param summary { String } - Intermediate summaries that we'll keep extending as part of our "folding" algorithm.
      # @param cursor { Integer } - Idx to know how much we already summarized.
      # @param user { User } - User object used for auditing usage.
      # @param &on_partial_blk { Block - Optional } - The passed block will get called with the LLM partial response alongside a cancel function.
      # Note: The block is only called with results of the final summary, not intermediate summaries.
      #
      # The summarization algorithm.
      # The idea is to build an initial summary packing as much content as we can. Once we have the initial summary, we'll keep extending it using the leftover
      # content until there is nothing left.
      #
      # @returns { String } - Resulting summary.
      def fold(items, summary, cursor, user, &on_partial_blk)
        tokenizer = llm_model.tokenizer_class
        tokens_left = available_tokens - tokenizer.size(summary)
        iteration_content = []
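        # Greedily pack items, in order, until the next one would overflow the tokens left
        # for this pass.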
        items.each_with_index do |item, idx|
          next if idx < cursor

          as_text = "(#{item[:id]} #{item[:poster]} said: #{item[:text]} "

          if tokenizer.below_limit?(as_text, tokens_left)
            iteration_content << item
            tokens_left -= tokenizer.size(as_text)
            cursor += 1
          else
            break
          end
        end
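        # The first pass builds a fresh summary; later passes extend the previous one with
        # the newly packed items.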
        prompt =
          (
            if summary.blank?
              strategy.first_summary_prompt(iteration_content)
            else
              strategy.summary_extension_prompt(summary, iteration_content)
            end
          )
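        # If everything fit in this pass, stream the final summary through the caller's block;
        # otherwise generate an intermediate summary and recurse over the remaining items.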
        if cursor == items.length
          llm.generate(prompt, user: user, feature_name: strategy.feature, &on_partial_blk)
        else
          latest_summary =
            llm.generate(prompt, user: user, max_tokens: 600, feature_name: strategy.feature)
          fold(items, latest_summary, cursor, user, &on_partial_blk)
        end
      end

      def available_tokens
        # Reserve tokens for the response and the base prompt
        # ~500 words
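        # e.g. a model with an 8_192-token prompt window leaves 8_192 - 700 = 7_492 tokens
        # of content budget per pass (illustrative figure).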
        reserved_tokens = 700

        llm_model.max_prompt_tokens - reserved_tokens
      end
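      # Keeps roughly the first and last `truncation_length` tokens of an item's text and drops
      # the middle, so a single oversized item can't swallow the whole context window.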
      def truncate(item)
        item_content = item[:text].to_s
        split_1, split_2 =
          [item_content[0, item_content.size / 2], item_content[(item_content.size / 2)..-1]]

        truncation_length = 500
        tokenizer = llm_model.tokenizer_class

        item[:text] = [
          tokenizer.truncate(split_1, truncation_length),
          tokenizer.truncate(split_2.reverse, truncation_length).reverse,
        ].join(" ")

        item
      end
    end
  end
end