FIX/REFACTOR: FoldContent revamp (#866)
* FIX/REFACTOR: FoldContent revamp

  We hit a snag with our hot topic gist strategy: the regex we used to split the content didn't work, so we couldn't send the original post separately. That separation mattered because it let the model focus on what's new in the topic. The algorithm doesn't give us full control over how prompts are written, and figuring out how to format the content isn't straightforward, which pushes us into complicated workarounds like regex.

  To tackle this, I'm suggesting we simplify the approach: summarize as much as we can upfront, then gradually fold in new content until there's nothing left to summarize. The "extend" part mostly matters for models with small context windows, which shouldn't be a problem 99% of the time given the content volume we're dealing with.

* Fix fold docs

* Use #shift instead of #pop to get the first elem, not the last
parent 12869f2146
commit ec97996905
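To make the new algorithm easier to follow before reading the diff, here is a minimal, self-contained sketch of the folding loop. This is not the plugin's code: `BUDGET`, `count_tokens`, and `summarize_with_llm` are stand-ins for the tokenizer and LLM client; only the control flow mirrors the new `FoldContent#fold` further down.

```ruby
# Standalone sketch of the folding idea this PR introduces.
BUDGET = 50 # pretend per-call token budget

def count_tokens(text)
  text.split.size # crude word-count stand-in for a real tokenizer
end

def summarize_with_llm(summary, batch)
  # Stub: the real code builds first_summary_prompt or summary_extension_prompt
  # and calls the LLM. Intermediate summaries are capped (max_tokens: 600), so
  # the budget for new content never collapses to zero.
  prefix = summary.empty? ? "Summary of " : "#{summary} + "
  "#{prefix}#{batch.size} item(s)"
end

def fold(items, summary = "", cursor = 0)
  tokens_left = BUDGET - count_tokens(summary)
  batch = []

  # Pack as many not-yet-summarized items as fit in the remaining budget.
  items[cursor..].each do |item|
    break if count_tokens(item) > tokens_left

    batch << item
    tokens_left -= count_tokens(item)
    cursor += 1
  end

  summary = summarize_with_llm(summary, batch)

  # Either everything was consumed, or we recurse to extend the summary.
  cursor == items.length ? summary : fold(items, summary, cursor)
end

puts fold(["post one " * 10, "post two " * 10, "post three " * 10])
```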
@@ -26,7 +26,7 @@ module DiscourseAi
         strategy = DiscourseAi::Summarization::Strategies::ChatMessages.new(channel, since)

         summarized_text =
-          if strategy.targets_data[:contents].empty?
+          if strategy.targets_data.empty?
             I18n.t("discourse_ai.summarization.chat.no_targets")
           else
             summarizer.summarize(current_user)&.summarized_text

@@ -18,35 +18,18 @@ module DiscourseAi
       attr_reader :llm, :strategy

       # @param user { User } - User object used for auditing usage.
-      #
       # @param &on_partial_blk { Block - Optional } - The passed block will get called with the LLM partial response alongside a cancel function.
       # Note: The block is only called with results of the final summary, not intermediate summaries.
       #
       # @returns { AiSummary } - Resulting summary.
       def summarize(user, &on_partial_blk)
-        opts = content_to_summarize.except(:contents)
-
-        initial_chunks =
-          rebalance_chunks(
-            content_to_summarize[:contents].map do |c|
-              { ids: [c[:id]], summary: format_content_item(c) }
-            end,
-          )
-
-        # Special case where we can do all the summarization in one pass.
-        result =
-          if initial_chunks.length == 1
-            {
-              summary:
-                summarize_single(initial_chunks.first[:summary], user, opts, &on_partial_blk),
-              chunks: [],
-            }
-          else
-            summarize_chunks(initial_chunks, user, opts, &on_partial_blk)
-          end
+        base_summary = ""
+        initial_pos = 0
+        folded_summary =
+          fold(content_to_summarize, base_summary, initial_pos, user, &on_partial_blk)

         clean_summary =
-          Nokogiri::HTML5.fragment(result[:summary]).css("ai")&.first&.text || result[:summary]
+          Nokogiri::HTML5.fragment(folded_summary).css("ai")&.first&.text || folded_summary

         if persist_summaries
           AiSummary.store!(

@@ -54,7 +37,7 @@ module DiscourseAi
             strategy.type,
             llm_model.name,
             clean_summary,
-            content_to_summarize[:contents].map { |c| c[:id] },
+            content_to_summarize.map { |c| c[:id] },
           )
         else
           AiSummary.new(summarized_text: clean_summary)

@@ -96,90 +79,58 @@ module DiscourseAi
       end

       def latest_sha
-        @latest_sha ||= AiSummary.build_sha(content_to_summarize[:contents].map { |c| c[:id] }.join)
+        @latest_sha ||= AiSummary.build_sha(content_to_summarize.map { |c| c[:id] }.join)
       end

-      def summarize_chunks(chunks, user, opts, &on_partial_blk)
-        # Safely assume we always have more than one chunk.
-        summarized_chunks = summarize_in_chunks(chunks, user, opts)
-        total_summaries_size =
-          llm_model.tokenizer_class.size(summarized_chunks.map { |s| s[:summary].to_s }.join)
-
-        if total_summaries_size < available_tokens
-          # Chunks are small enough, we can concatenate them.
-          {
-            summary:
-              concatenate_summaries(
-                summarized_chunks.map { |s| s[:summary] },
-                user,
-                &on_partial_blk
-              ),
-            chunks: summarized_chunks,
-          }
-        else
-          # We have summarized chunks but we can't concatenate them yet. Split them into smaller summaries and summarize again.
-          rebalanced_chunks = rebalance_chunks(summarized_chunks)
-
-          summarize_chunks(rebalanced_chunks, user, opts, &on_partial_blk)
-        end
-      end
-
-      def format_content_item(item)
-        "(#{item[:id]} #{item[:poster]} said: #{item[:text]} "
-      end
-
-      def rebalance_chunks(chunks)
-        section = { ids: [], summary: "" }
-
-        chunks =
-          chunks.reduce([]) do |sections, chunk|
-            if llm_model.tokenizer_class.can_expand_tokens?(
-                 section[:summary],
-                 chunk[:summary],
-                 available_tokens,
-               )
-              section[:summary] += chunk[:summary]
-              section[:ids] = section[:ids].concat(chunk[:ids])
-            else
-              sections << section
-              section = chunk
-            end
-
-            sections
-          end
-
-        chunks << section if section[:summary].present?
-
-        chunks
-      end
-
-      def summarize_single(text, user, opts, &on_partial_blk)
-        prompt = strategy.summarize_single_prompt(text, opts)
-
-        llm.generate(prompt, user: user, feature_name: "summarize", &on_partial_blk)
-      end
-
-      def summarize_in_chunks(chunks, user, opts)
-        chunks.map do |chunk|
-          prompt = strategy.summarize_single_prompt(chunk[:summary], opts)
-
-          chunk[:summary] = llm.generate(
-            prompt,
-            user: user,
-            max_tokens: 300,
-            feature_name: "summarize",
-          )
-
-          chunk
-        end
-      end
-
-      def concatenate_summaries(texts_to_summarize, user, &on_partial_blk)
-        prompt = strategy.concatenation_prompt(texts_to_summarize)
-
-        llm.generate(prompt, user: user, &on_partial_blk)
-      end
+      # @param items { Array<Hash> } - Content to summarize. Structure will be: { poster: who wrote the content, id: a way to order content, text: content }
+      # @param summary { String } - Intermediate summaries that we'll keep extending as part of our "folding" algorithm.
+      # @param cursor { Integer } - Idx to know how much we already summarized.
+      # @param user { User } - User object used for auditing usage.
+      # @param &on_partial_blk { Block - Optional } - The passed block will get called with the LLM partial response alongside a cancel function.
+      # Note: The block is only called with results of the final summary, not intermediate summaries.
+      #
+      # The summarization algorithm.
+      # The idea is to build an initial summary packing as much content as we can. Once we have the initial summary, we'll keep extending using the leftover
+      # content until there is nothing left.
+      #
+      # @returns { String } - Resulting summary.
+      def fold(items, summary, cursor, user, &on_partial_blk)
+        tokenizer = llm_model.tokenizer_class
+        tokens_left = available_tokens - tokenizer.size(summary)
+        iteration_content = []
+
+        items.each_with_index do |item, idx|
+          next if idx < cursor
+
+          as_text = "(#{item[:id]} #{item[:poster]} said: #{item[:text]} "
+
+          if tokenizer.below_limit?(as_text, tokens_left)
+            iteration_content << item
+            tokens_left -= tokenizer.size(as_text)
+            cursor += 1
+          else
+            break
+          end
+        end
+
+        prompt =
+          (
+            if summary.blank?
+              strategy.first_summary_prompt(iteration_content)
+            else
+              strategy.summary_extension_prompt(summary, iteration_content)
+            end
+          )
+
+        if cursor == items.length
+          llm.generate(prompt, user: user, feature_name: "summarize", &on_partial_blk)
+        else
+          latest_summary =
+            llm.generate(prompt, user: user, max_tokens: 600, feature_name: "summarize")
+          fold(items, latest_summary, cursor, user, &on_partial_blk)
+        end
+      end

       def available_tokens
         # Reserve tokens for the response and the base prompt
         # ~500 words

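Taken with the controller hunk at the top, usage stays a single call. A hedged sketch of the wiring, assuming the constructor takes the `llm` and `strategy` the `attr_reader` exposes (the constructor itself is not part of this diff):

```ruby
# Hedged usage sketch; constructor arguments are assumed, not shown in this diff.
strategy = DiscourseAi::Summarization::Strategies::ChatMessages.new(channel, since)
summarizer = DiscourseAi::Summarization::FoldContent.new(llm, strategy)

# Per the docs above, the block only streams partials of the final summary.
summary = summarizer.summarize(current_user) { |partial, cancel| print(partial) }
puts summary&.summarized_text
```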
@@ -11,7 +11,7 @@ module DiscourseAi
          @target = target
        end

-       attr_reader :target
+       attr_reader :target, :opts

        # The summary type differentiates instances of `AiSummary` pointing to a single target.
        # See the `summary_type` enum for available options.

@@ -19,11 +19,9 @@ module DiscourseAi
          raise NotImplementedError
        end

-       # @returns { Hash } - Content to summarize.
+       # @returns { Array<Hash> } - Content to summarize.
        #
-       # This method returns a hash with the content to summarize and additional information.
-       # The only mandatory key is `contents`, which must be an array of hashes with
-       # the following structure:
+       # This method returns an array of hashes with the content to summarize using the following structure:
        #
        # {
        #   poster: A way to tell who write the content,

@@ -31,26 +29,17 @@ module DiscourseAi
        #   text: Text to summarize
        # }
        #
-       # Additionally, you could add more context, which will be available in the prompt. e.g.:
-       #
-       # {
-       #   resource_path: "#{Discourse.base_path}/t/-/#{target.id}",
-       #   content_title: target.title,
-       #   contents: [...]
-       # }
-       #
        def targets_data
          raise NotImplementedError
        end

-       # @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM when concatenating multiple chunks.
-       def contatenation_prompt(_texts_to_summarize)
+       # @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM when extending an existing summary.
+       def summary_extension_prompt(_summary, _texts_to_summarize)
          raise NotImplementedError
        end

-       # @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM on each chunk,
-       # and when the whole content fits in one call.
-       def summarize_single_prompt(_input, _opts)
+       # @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM for summarizing a single chunk of content.
+       def first_summary_prompt(_input)
          raise NotImplementedError
        end
      end
    end

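A hypothetical strategy subclass makes the reshaped contract concrete: `targets_data` now returns a bare array (no wrapper hash, no `:contents` key), and the two prompt hooks replace `contatenation_prompt`/`summarize_single_prompt`. Everything below is illustrative; only `type` (used by `AiSummary.store!` in an earlier hunk) and the three methods shown in this hunk are taken from the diff.

```ruby
# Illustrative only: a made-up strategy showing the new interface shape.
class ExampleStrategy < DiscourseAi::Summarization::Strategies::Base
  def type
    :example
  end

  def targets_data
    # One hash per piece of content. Extra context such as resource_path or
    # content_title now lives inside the strategy's prompt builders instead.
    [
      { id: 1, poster: "alice", text: "First message" },
      { id: 2, poster: "bob", text: "Second message" },
    ]
  end

  def first_summary_prompt(contents)
    DiscourseAi::Completions::Prompt.new("Summarize the content you will be given.")
  end

  def summary_extension_prompt(summary, contents)
    DiscourseAi::Completions::Prompt.new("Extend this summary: #{summary}")
  end
end
```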
@@ -14,38 +14,60 @@ module DiscourseAi
        end

        def targets_data
-         content = { content_title: target.name }
-
-         content[:contents] = target
+         target
            .chat_messages
            .where("chat_messages.created_at > ?", since.hours.ago)
            .includes(:user)
            .order(created_at: :asc)
            .pluck(:id, :username_lower, :message)
            .map { { id: _1, poster: _2, text: _3 } }
-
-         content
        end

-       def contatenation_prompt(texts_to_summarize)
+       def summary_extension_prompt(summary, contents)
+         input =
+           contents
+             .map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
+             .join("\n")
+
          prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
-           You are a summarization bot tasked with creating a cohesive narrative by intelligently merging multiple disjointed summaries.
-           Your response should consist of well-structured paragraphs that combines these summaries into a clear and comprehensive overview.
-           Avoid adding any additional text or commentary. Format your output using Discourse forum Markdown.
+           You are a summarization bot tasked with expanding on an existing summary by incorporating new chat messages.
+           Your goal is to seamlessly integrate the additional information into the existing summary, preserving the clarity and insights of the original while reflecting any new developments, themes, or conclusions.
+           Analyze the new messages to identify key themes, participants' intentions, and any significant decisions or resolutions.
+           Update the summary to include these aspects in a way that remains concise, comprehensive, and accessible to someone with no prior context of the conversation.
+
+           ### Guidelines:
+
+           - Merge the new information naturally with the existing summary without redundancy.
+           - Only include the updated summary, WITHOUT additional commentary.
+           - Don't mention the channel title. Avoid extraneous details or subjective opinions.
+           - Maintain the original language of the text being summarized.
+           - The same user could write multiple messages in a row, don't treat them as different persons.
+           - Aim for summaries to be extended by a reasonable amount, but strive to maintain a total length of 400 words or less, unless absolutely necessary for comprehensiveness.
          TEXT

          prompt.push(type: :user, content: <<~TEXT.strip)
-           THESE are the summaries, each one separated by a newline, all of them inside <input></input> XML tags:
+           ### Context:

-           <input>
-           #{texts_to_summarize.join("\n")}
-           </input>
+           This is the existing summary:
+
+           #{summary}
+
+           These are the new chat messages:
+
+           #{input}
+
+           Intengrate the new messages into the existing summary.
          TEXT

          prompt
        end

-       def summarize_single_prompt(input, opts)
+       def first_summary_prompt(contents)
+         content_title = target.name
+         input =
+           contents.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }.join
+
          prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
            You are a summarization bot designed to generate clear and insightful paragraphs that conveys the main topics
            and developments from a series of chat messages within a user-selected time window.

@@ -62,7 +84,7 @@ module DiscourseAi
          TEXT

          prompt.push(type: :user, content: <<~TEXT.strip)
-           #{opts[:content_title].present? ? "The name of the channel is: " + opts[:content_title] + ".\n" : ""}
+           #{content_title.present? ? "The name of the channel is: " + content_title + ".\n" : ""}

            Here are the messages, inside <input></input> XML tags:

@@ -9,8 +9,6 @@ module DiscourseAi
        end

        def targets_data
-         content = { content_title: target.title, contents: [] }
-
          op_post_number = 1

          hot_topics_recent_cutoff = Time.zone.now - SiteSetting.hot_topics_recent_days.days

@@ -44,44 +42,62 @@ module DiscourseAi
            .order(:post_number)
            .pluck(:post_number, :raw, :username)

-         posts_data.each do |(pn, raw, username)|
+         posts_data.reduce([]) do |memo, (pn, raw, username)|
            raw_text = raw

            if pn == 1 && target.topic_embed&.embed_content_cache.present?
              raw_text = target.topic_embed&.embed_content_cache
            end

-           content[:contents] << { poster: username, id: pn, text: raw_text }
+           memo << { poster: username, id: pn, text: raw_text }
          end
-
-         content
        end

-       def concatenation_prompt(texts_to_summarize)
+       def summary_extension_prompt(summary, contents)
+         statements =
+           contents
+             .to_a
+             .map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
+             .join("\n")
+
          prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
-           You are a summarization bot tasked with creating a single, concise sentence by merging disjointed summaries into a cohesive statement.
-           Your response should strictly be this single, comprehensive sentence, without any additional text or comments.
+           You are an advanced summarization bot. Your task is to update an existing single-sentence summary by integrating new developments from a conversation.
+           Analyze the most recent messages to identify key updates or shifts in the main topic and reflect these in the updated summary.
+           Emphasize new significant information or developments within the context of the initial conversation theme.

-           - Focus on the central theme or issue being addressed, maintaining an objective and neutral tone.
-           - Exclude extraneous details or subjective opinions.
+           ### Guidelines:
+
+           - Ensure the revised summary remains concise and objective, maintaining a focus on the central theme or issue.
+           - Omit extraneous details or subjective opinions.
            - Use the original language of the text.
            - Begin directly with the main topic or issue, avoiding introductory phrases.
-           - Limit the summary to a maximum of 20 words.
+           - Limit the updated summary to a maximum of 20 words.
+           - Return the 20-word summary inside <ai></ai> tags.
          TEXT

          prompt.push(type: :user, content: <<~TEXT.strip)
-           THESE are the summaries, each one separated by a newline, all of them inside <input></input> XML tags:
+           ### Context:

-           <input>
-           #{texts_to_summarize.join("\n")}
-           </input>
+           This is the existing single-sentence summary:
+
+           #{summary}
+
+           And these are the new developments in the conversation:
+
+           #{statements}
+
+           Your task is to update an existing single-sentence summary by integrating new developments from a conversation.
+           Return the 20-word summary inside <ai></ai> tags.
          TEXT

          prompt
        end

-       def summarize_single_prompt(input, opts)
-         statements = input.split(/(?=\d+\) \w+ said:)/)
+       def first_summary_prompt(contents)
+         content_title = target.title
+         statements =
+           contents.to_a.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }

          prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
            You are an advanced summarization bot. Analyze a given conversation and produce a concise,

@@ -95,25 +111,25 @@ module DiscourseAi
            - Use the original language of the text.
            - Begin directly with the main topic or issue, avoiding introductory phrases.
            - Limit the summary to a maximum of 20 words.
+           - Return the 20-word summary inside <ai></ai> tags.

-           Return the 20-word summary inside <ai></ai> tags.
          TEXT

          context = +<<~TEXT
            ### Context:

-           #{opts[:content_title].present? ? "The discussion title is: " + opts[:content_title] + ".\n" : ""}
+           #{content_title.present? ? "The discussion title is: " + content_title + ".\n" : ""}

            The conversation began with the following statement:

-           #{statements&.pop}\n
+           #{statements.shift}\n
          TEXT

          if statements.present?
            context << <<~TEXT
              Subsequent discussion includes the following:

-             #{statements&.join("\n")}
+             #{statements.join("\n")}

              Your task is to focus on these latest messages, capturing their meaning in the context of the initial statement.
            TEXT

@@ -9,12 +9,6 @@ module DiscourseAi
        end

        def targets_data
-         content = {
-           resource_path: "#{Discourse.base_path}/t/-/#{target.id}",
-           content_title: target.title,
-           contents: [],
-         }
-
          posts_data =
            (target.has_summary? ? best_replies : pick_selection).pluck(
              :post_number,

@@ -22,85 +16,102 @@ module DiscourseAi
              :username,
            )

-         posts_data.each do |(pn, raw, username)|
+         posts_data.reduce([]) do |memo, (pn, raw, username)|
            raw_text = raw

            if pn == 1 && target.topic_embed&.embed_content_cache.present?
              raw_text = target.topic_embed&.embed_content_cache
            end

-           content[:contents] << { poster: username, id: pn, text: raw_text }
+           memo << { poster: username, id: pn, text: raw_text }
          end
-
-         content
        end

-       def concatenation_prompt(texts_to_summarize)
-         prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
-           You are a summarization bot that effectively concatenates disjointed summaries, creating a cohesive narrative.
-           The narrative you create is in the form of one or multiple paragraphs.
-           Your reply MUST BE a single concatenated summary using the summaries I'll provide to you.
-           I'm NOT interested in anything other than the concatenated summary, don't include additional text or comments.
-           You understand and generate Discourse forum Markdown.
-           You format the response, including links, using Markdown.
+       def summary_extension_prompt(summary, contents)
+         resource_path = "#{Discourse.base_path}/t/-/#{target.id}"
+         content_title = target.title
+         input =
+           contents.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]})" }.join
+
+         prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT)
+           You are an advanced summarization bot tasked with enhancing an existing summary by incorporating additional posts.
+
+           ### Guidelines:
+           - Only include the enhanced summary, without any additional commentary.
+           - Understand and generate Discourse forum Markdown; including links, _italics_, **bold**.
+           - Maintain the original language of the text being summarized.
+           - Aim for summaries to be 400 words or less.
+           - Each new post is formatted as "<POST_NUMBER>) <USERNAME> <MESSAGE>"
+           - Cite specific noteworthy posts using the format [NAME](#{resource_path}/POST_NUMBER)
+             - Example: link to the 3rd post by sam: [sam](#{resource_path}/3)
+             - Example: link to the 6th post by jane: [agreed with](#{resource_path}/6)
+             - Example: link to the 13th post by joe: [#13](#{resource_path}/13)
+           - When formatting usernames either use @USERNAME or [USERNAME](#{resource_path}/POST_NUMBER)
          TEXT

          prompt.push(type: :user, content: <<~TEXT.strip)
-           THESE are the summaries, each one separated by a newline, all of them inside <input></input> XML tags:
+           ### Context:
+
+           #{content_title.present? ? "The discussion title is: " + content_title + ".\n" : ""}
+
+           Here is the existing summary:
+
+           #{summary}
+
+           Here are the new posts, inside <input></input> XML tags:

            <input>
-           #{texts_to_summarize.join("\n")}
+           #{input}
            </input>
+
+           Integrate the new information to generate an enhanced concise and coherent summary.
          TEXT

          prompt
        end

-       def summarize_single_prompt(input, opts)
-         insts = +<<~TEXT
-           You are an advanced summarization bot that generates concise, coherent summaries of provided text.
-
-           - Only include the summary, without any additional commentary.
-           - You understand and generate Discourse forum Markdown; including links, _italics_, **bold**.
-           - Maintain the original language of the text being summarized.
-           - Aim for summaries to be 400 words or less.
-
-         TEXT
-
-         insts << <<~TEXT if opts[:resource_path]
-           - Each post is formatted as "<POST_NUMBER>) <USERNAME> <MESSAGE>"
-           - Cite specific noteworthy posts using the format [NAME](#{opts[:resource_path]}/POST_NUMBER)
-             - Example: link to the 3rd post by sam: [sam](#{opts[:resource_path]}/3)
-             - Example: link to the 6th post by jane: [agreed with](#{opts[:resource_path]}/6)
-             - Example: link to the 13th post by joe: [#13](#{opts[:resource_path]}/13)
-           - When formatting usernames either use @USERNMAE OR [USERNAME](#{opts[:resource_path]}/POST_NUMBER)
-         TEXT
-
-         prompt = DiscourseAi::Completions::Prompt.new(insts.strip)
-
-         if opts[:resource_path]
-           prompt.push(
-             type: :user,
-             content:
-               "Here are the posts inside <input></input> XML tags:\n\n<input>1) user1 said: I love Mondays 2) user2 said: I hate Mondays</input>\n\nGenerate a concise, coherent summary of the text above maintaining the original language.",
-           )
-           prompt.push(
-             type: :model,
-             content:
-               "Two users are sharing their feelings toward Mondays. [user1](#{opts[:resource_path]}/1) hates them, while [user2](#{opts[:resource_path]}/2) loves them.",
-           )
-         end
+       def first_summary_prompt(contents)
+         resource_path = "#{Discourse.base_path}/t/-/#{target.id}"
+         content_title = target.title
+         input =
+           contents.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }.join
+
+         prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
+           You are an advanced summarization bot that generates concise, coherent summaries of provided text.
+
+           - Only include the summary, without any additional commentary.
+           - You understand and generate Discourse forum Markdown; including links, _italics_, **bold**.
+           - Maintain the original language of the text being summarized.
+           - Aim for summaries to be 400 words or less.
+           - Each post is formatted as "<POST_NUMBER>) <USERNAME> <MESSAGE>"
+           - Cite specific noteworthy posts using the format [NAME](#{resource_path}/POST_NUMBER)
+             - Example: link to the 3rd post by sam: [sam](#{resource_path}/3)
+             - Example: link to the 6th post by jane: [agreed with](#{resource_path}/6)
+             - Example: link to the 13th post by joe: [#13](#{resource_path}/13)
+           - When formatting usernames either use @USERNMAE OR [USERNAME](#{resource_path}/POST_NUMBER)
+         TEXT
+
+         prompt.push(
+           type: :user,
+           content:
+             "Here are the posts inside <input></input> XML tags:\n\n<input>1) user1 said: I love Mondays 2) user2 said: I hate Mondays</input>\n\nGenerate a concise, coherent summary of the text above maintaining the original language.",
+         )
+         prompt.push(
+           type: :model,
+           content:
+             "Two users are sharing their feelings toward Mondays. [user1](#{resource_path}/1) hates them, while [user2](#{resource_path}/2) loves them.",
+         )

          prompt.push(type: :user, content: <<~TEXT.strip)
-           #{opts[:content_title].present? ? "The discussion title is: " + opts[:content_title] + ".\n" : ""}
+           #{content_title.present? ? "The discussion title is: " + content_title + ".\n" : ""}
            Here are the posts, inside <input></input> XML tags:

            <input>
            #{input}
            </input>

            Generate a concise, coherent summary of the text above maintaining the original language.
          TEXT

          prompt
        end

@@ -40,14 +40,12 @@ module DiscourseAi
        tokenizer.decode(tokenizer.encode(text).ids.take(max_length))
      end

-     def can_expand_tokens?(text, addition, max_length)
+     def below_limit?(text, limit)
        # fast track common case, /2 to handle unicode chars
        # than can take more than 1 token per char
-       if !SiteSetting.ai_strict_token_counting && text.size + addition.size < max_length / 2
-         return true
-       end
+       return true if !SiteSetting.ai_strict_token_counting && text.size < limit / 2

-       tokenizer.encode(text).ids.length + tokenizer.encode(addition).ids.length < max_length
+       tokenizer.encode(text).ids.length < limit
      end
    end
  end

@@ -31,14 +31,12 @@ module DiscourseAi
        retry
      end

-     def can_expand_tokens?(text, addition, max_length)
+     def below_limit?(text, limit)
        # fast track common case, /2 to handle unicode chars
        # than can take more than 1 token per char
-       if !SiteSetting.ai_strict_token_counting && text.size + addition.size < max_length / 2
-         return true
-       end
+       return true if !SiteSetting.ai_strict_token_counting && text.size < limit / 2

-       tokenizer.encode(text).length + tokenizer.encode(addition).length < max_length
+       tokenizer.encode(text).length < limit
      end
    end
  end

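The call-site translation for the tokenizer change, in one line each way (`tokenizer` stands for either class above; the values come from the spec hunk at the bottom of this diff):

```ruby
# Before: ask whether `addition` still fits next to `text` under max_length.
tokenizer.can_expand_tokens?("foo bar", "baz qux", 6)
# After: callers concatenate up front and check the combined text against the limit.
tokenizer.below_limit?("foo bar baz qux", 6)
```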
@@ -15,12 +15,15 @@ RSpec.describe DiscourseAi::Summarization::FoldContent do
    # Make sure each content fits in a single chunk.
    # 700 is the number of tokens reserved for the prompt.
    model_tokens =
-     700 + DiscourseAi::Tokenizer::OpenAiTokenizer.size("(1 asd said: This is a text ") + 3
+     700 +
+       DiscourseAi::Tokenizer::OpenAiTokenizer.size(
+         "(1 #{post_1.user.username_lower} said: This is a text ",
+       ) + 3

    llm_model.update!(max_prompt_tokens: model_tokens)
  end

- let(:single_summary) { "this is a single summary" }
+ let(:single_summary) { "single" }
  let(:concatenated_summary) { "this is a concatenated summary" }

  let(:user) { User.new }

@@ -39,29 +42,11 @@ RSpec.describe DiscourseAi::Summarization::FoldContent do
    context "when the content to summarize doesn't fit in a single call" do
      fab!(:post_2) { Fabricate(:post, topic: topic, post_number: 2, raw: "This is a text") }

-     it "summarizes each chunk and then concatenates them" do
+     it "keeps extending the summary until there is nothing else to process" do
        result =
          DiscourseAi::Completions::Llm.with_prepared_responses(
-           [single_summary, single_summary, concatenated_summary],
-         ) { |spy| summarizer.summarize(user).tap { expect(spy.completions).to eq(3) } }
-
-       expect(result.summarized_text).to eq(concatenated_summary)
-     end
-
-     it "keeps splitting into chunks until the content fits into a single call to create a cohesive narrative" do
-       max_length_response = "(1 asd said: This is a text "
-       chunk_of_chunks = "I'm smol"
-
-       result =
-         DiscourseAi::Completions::Llm.with_prepared_responses(
-           [
-             max_length_response,
-             max_length_response,
-             chunk_of_chunks,
-             chunk_of_chunks,
-             concatenated_summary,
-           ],
-         ) { |spy| summarizer.summarize(user).tap { expect(spy.completions).to eq(5) } }
+           [single_summary, concatenated_summary],
+         ) { |spy| summarizer.summarize(user).tap { expect(spy.completions).to eq(2) } }

        expect(result.summarized_text).to eq(concatenated_summary)
      end

@@ -12,7 +12,7 @@ RSpec.describe DiscourseAi::Summarization::Strategies::HotTopicGists do
      post_2.update(created_at: (SiteSetting.hot_topics_recent_days + 1).days.ago)
      Fabricate(:post, topic: topic, post_number: 3)

-     post_numbers = gist.targets_data[:contents].map { |c| c[:id] }
+     post_numbers = gist.targets_data.map { |c| c[:id] }

      expect(post_numbers).to contain_exactly(1, 3)
    end

@@ -20,7 +20,7 @@ RSpec.describe DiscourseAi::Summarization::Strategies::HotTopicGists do
    it "only includes visible posts" do
      post_2.update!(hidden: true)

-     post_numbers = gist.targets_data[:contents].map { |c| c[:id] }
+     post_numbers = gist.targets_data.map { |c| c[:id] }

      expect(post_numbers).to contain_exactly(1)
    end

@@ -28,7 +28,7 @@ RSpec.describe DiscourseAi::Summarization::Strategies::HotTopicGists do
    it "doesn't include posts without users" do
      post_2.update!(user_id: nil)

-     post_numbers = gist.targets_data[:contents].map { |c| c[:id] }
+     post_numbers = gist.targets_data.map { |c| c[:id] }

      expect(post_numbers).to contain_exactly(1)
    end

@@ -36,7 +36,7 @@ RSpec.describe DiscourseAi::Summarization::Strategies::HotTopicGists do
    it "doesn't include whispers" do
      post_2.update!(post_type: Post.types[:whisper])

-     post_numbers = gist.targets_data[:contents].map { |c| c[:id] }
+     post_numbers = gist.targets_data.map { |c| c[:id] }

      expect(post_numbers).to contain_exactly(1)
    end

@@ -51,8 +51,7 @@ RSpec.describe DiscourseAi::Summarization::Strategies::HotTopicGists do
      )

      content = gist.targets_data
-     op_content = content[:contents].first[:text]
+     op_content = content.first[:text]

      expect(op_content).to include(topic_embed.embed_content_cache)
    end

@@ -12,7 +12,7 @@ RSpec.describe DiscourseAi::Summarization::Strategies::TopicSummary do
    it "only includes visible posts" do
      post_2.update!(hidden: true)

-     post_numbers = topic_summary.targets_data[:contents].map { |c| c[:id] }
+     post_numbers = topic_summary.targets_data.map { |c| c[:id] }

      expect(post_numbers).to contain_exactly(1)
    end

@@ -20,7 +20,7 @@ RSpec.describe DiscourseAi::Summarization::Strategies::TopicSummary do
    it "doesn't include posts without users" do
      post_2.update!(user_id: nil)

-     post_numbers = topic_summary.targets_data[:contents].map { |c| c[:id] }
+     post_numbers = topic_summary.targets_data.map { |c| c[:id] }

      expect(post_numbers).to contain_exactly(1)
    end

@@ -28,7 +28,7 @@ RSpec.describe DiscourseAi::Summarization::Strategies::TopicSummary do
    it "doesn't include whispers" do
      post_2.update!(post_type: Post.types[:whisper])

-     post_numbers = topic_summary.targets_data[:contents].map { |c| c[:id] }
+     post_numbers = topic_summary.targets_data.map { |c| c[:id] }

      expect(post_numbers).to contain_exactly(1)
    end

@@ -56,8 +56,7 @@ RSpec.describe DiscourseAi::Summarization::Strategies::TopicSummary do
      )

      content = topic_summary.targets_data
-     op_content = content[:contents].first[:text]
+     op_content = content.first[:text]

      expect(op_content).to include(topic_embed.embed_content_cache)
    end

@@ -90,21 +90,21 @@ describe DiscourseAi::Tokenizer::OpenAiTokenizer do
    end
  end

- describe "#can_expand_tokens?" do
+ describe "#below_limit?" do
    it "returns true when the tokens can be expanded" do
-     expect(described_class.can_expand_tokens?("foo bar", "baz qux", 6)).to eq(true)
+     expect(described_class.below_limit?("foo bar baz qux", 6)).to eq(true)
    end

    it "returns false when the tokens cannot be expanded" do
-     expect(described_class.can_expand_tokens?("foo bar", "baz qux", 3)).to eq(false)
+     expect(described_class.below_limit?("foo bar baz qux", 3)).to eq(false)
    end

    it "returns false when the tokens cannot be expanded due to multibyte unicode characters" do
-     expect(described_class.can_expand_tokens?("foo bar 👨🏿", "baz qux", 6)).to eq(false)
+     expect(described_class.below_limit?("foo bar 👨🏿 baz qux", 6)).to eq(false)
    end

    it "handles unicode characters properly when they use more than one token per char" do
-     expect(described_class.can_expand_tokens?("我喜欢吃比萨", "萨", 10)).to eq(false)
+     expect(described_class.below_limit?("我喜欢吃比萨萨", 10)).to eq(false)
    end
  end
end