REFACTOR: Support of different summarization targets/prompts. (#835)
* DEV: Add summary types
* Refactor for different summary types
* Use enum for summary types
* Update lib/summarization/strategies/topic_summary.rb
  Co-authored-by: Penar Musaraj <pmusaraj@gmail.com>
* Update lib/summarization/strategies/topic_gist.rb
  Co-authored-by: Penar Musaraj <pmusaraj@gmail.com>
* Update lib/summarization/strategies/chat_messages.rb
  Co-authored-by: Penar Musaraj <pmusaraj@gmail.com>
* Fix chat_messages single prompt
* Small tweak to the chat summarization prompt

---------

Co-authored-by: Penar Musaraj <pmusaraj@gmail.com>
Parent: 791fad1e6a
Commit: c7acb4a6a0
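At a glance, the refactor replaces the single `DiscourseAi::Summarization.default_strategy` entry point with target-specific builders. A rough usage sketch follows, assuming only the classes shown in this diff; the `topic`, `channel`, and `current_user` variables are illustrative:

# Each builder returns a DiscourseAi::Summarization::FoldContent summarizer wired to a
# target-specific strategy, or nil when summarization is disabled.
summarizer = DiscourseAi::Summarization.topic_summary(topic)               # full topic summary
gist       = DiscourseAi::Summarization.topic_gist(topic)                  # one-sentence gist
chat       = DiscourseAi::Summarization.chat_channel_summary(channel, 24)  # messages from the last 24 hours

# FoldContent#summarize returns an AiSummary record (persisted unless persist_summaries: false).
summary = summarizer&.summarize(current_user)
summary&.summarized_text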
@ -15,29 +15,21 @@ module DiscourseAi
|
|||
channel = ::Chat::Channel.find(params[:channel_id])
|
||||
guardian.ensure_can_join_chat_channel!(channel)
|
||||
|
||||
strategy = DiscourseAi::Summarization.default_strategy
|
||||
raise Discourse::NotFound.new unless strategy
|
||||
summarizer = DiscourseAi::Summarization.chat_channel_summary(channel, since)
|
||||
raise Discourse::NotFound.new unless summarizer
|
||||
|
||||
guardian.ensure_can_request_summary!
|
||||
|
||||
RateLimiter.new(current_user, "channel_summary", 6, 5.minutes).performed!
|
||||
|
||||
hijack do
|
||||
content = { content_title: channel.name }
|
||||
|
||||
content[:contents] = channel
|
||||
.chat_messages
|
||||
.where("chat_messages.created_at > ?", since.hours.ago)
|
||||
.includes(:user)
|
||||
.order(created_at: :asc)
|
||||
.pluck(:id, :username_lower, :message)
|
||||
.map { { id: _1, poster: _2, text: _3 } }
|
||||
strategy = DiscourseAi::Summarization::Strategies::ChatMessages.new(channel, since)
|
||||
|
||||
summarized_text =
|
||||
if content[:contents].empty?
|
||||
if strategy.targets_data[:contents].empty?
|
||||
I18n.t("discourse_ai.summarization.chat.no_targets")
|
||||
else
|
||||
strategy.summarize(content, current_user).dig(:summary)
|
||||
summarizer.summarize(current_user)&.summarized_text
|
||||
end
|
||||
|
||||
render json: { summary: summarized_text }
|
||||
|
|
|
@ -9,15 +9,19 @@ module DiscourseAi
|
|||
topic = Topic.find(params[:topic_id])
|
||||
guardian.ensure_can_see!(topic)
|
||||
|
||||
raise Discourse::NotFound if !guardian.can_see_summary?(topic)
|
||||
if !guardian.can_see_summary?(topic, AiSummary.summary_types[:complete])
|
||||
raise Discourse::NotFound
|
||||
end
|
||||
|
||||
RateLimiter.new(current_user, "summary", 6, 5.minutes).performed! if current_user
|
||||
|
||||
opts = params.permit(:skip_age_check)
|
||||
skip_age_check = opts[:skip_age_check] == "true"
|
||||
|
||||
summarization_service = DiscourseAi::TopicSummarization.for(topic, current_user)
|
||||
|
||||
if params[:stream] && current_user
|
||||
cached_summary = DiscourseAi::TopicSummarization.cached_summary(topic, current_user)
|
||||
cached_summary = summarization_service.cached_summary
|
||||
|
||||
if cached_summary && !skip_age_check
|
||||
render_serialized(cached_summary, AiTopicSummarySerializer)
|
||||
|
@ -34,12 +38,7 @@ module DiscourseAi
|
|||
render json: success_json
|
||||
else
|
||||
hijack do
|
||||
summary =
|
||||
DiscourseAi::TopicSummarization.summarize(
|
||||
topic,
|
||||
current_user,
|
||||
skip_age_check: skip_age_check,
|
||||
)
|
||||
summary = summarization_service.summarize(skip_age_check: skip_age_check)
|
||||
render_serialized(summary, AiTopicSummarySerializer)
|
||||
end
|
||||
end
|
||||
|
|
|
@ -8,8 +8,11 @@ module Jobs
|
|||
return unless topic = Topic.find_by(id: args[:topic_id])
|
||||
return unless user = User.find_by(id: args[:user_id])
|
||||
|
||||
strategy = DiscourseAi::Summarization.default_strategy
|
||||
return if strategy.nil? || !Guardian.new(user).can_see_summary?(topic)
|
||||
strategy = DiscourseAi::Summarization.topic_summary(topic)
|
||||
if strategy.nil? ||
|
||||
!Guardian.new(user).can_see_summary?(topic, AiSummary.summary_types[:complete])
|
||||
return
|
||||
end
|
||||
|
||||
guardian = Guardian.new(user)
|
||||
return unless guardian.can_see?(topic)
|
||||
|
@ -21,7 +24,7 @@ module Jobs
|
|||
|
||||
summary =
|
||||
DiscourseAi::TopicSummarization
|
||||
.new(strategy, topic, user)
|
||||
.new(strategy, user)
|
||||
.summarize(skip_age_check: skip_age_check) do |partial_summary|
|
||||
streamed_summary << partial_summary
|
||||
|
||||
|
|
|
@ -3,6 +3,23 @@
|
|||
class AiSummary < ActiveRecord::Base
|
||||
belongs_to :target, polymorphic: true
|
||||
|
||||
enum :summary_type, { complete: 0, gist: 1 }
|
||||
|
||||
def self.store!(target, summary_type, model, summary, content_ids)
|
||||
AiSummary.create!(
|
||||
target: target,
|
||||
algorithm: model,
|
||||
content_range: (content_ids.first..content_ids.last),
|
||||
summarized_text: summary,
|
||||
original_content_sha: build_sha(content_ids.join),
|
||||
summary_type: summary_type,
|
||||
)
|
||||
end
|
||||
|
||||
def self.build_sha(joined_ids)
|
||||
Digest::SHA256.hexdigest(joined_ids)
|
||||
end
|
||||
|
||||
def mark_as_outdated
|
||||
@outdated = true
|
||||
end
|
||||
|
@ -25,6 +42,7 @@ end
|
|||
# algorithm :string not null
|
||||
# created_at :datetime not null
|
||||
# updated_at :datetime not null
|
||||
# summary_type :string default("complete"), not null
|
||||
#
|
||||
# Indexes
|
||||
#
|
||||
|
|
|
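The new `summary_type` enum lets multiple AiSummary rows coexist for the same target. A small sketch of how the enum and the `store!` helper above behave; the `topic` variable and the model name string are illustrative:

AiSummary.summary_types              # => { "complete" => 0, "gist" => 1 }
AiSummary.summary_types[:gist]       # => 1

# Persist a complete summary covering posts 1..3 of a topic (arguments follow store! above).
AiSummary.store!(topic, :complete, "some-llm", "Summary text", [1, 2, 3])

# Cached summaries can then be looked up per type, as the rest of this diff does.
AiSummary.find_by(target: topic, summary_type: AiSummary.summary_types[:gist])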
@ -1,145 +1,44 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
module DiscourseAi
|
||||
# A cache layer on top of our topic summarization engine. Also handles permissions.
|
||||
class TopicSummarization
|
||||
def self.summarize(topic, user, skip_age_check: false, &on_partial_blk)
|
||||
new(DiscourseAi::Summarization.default_strategy, topic, user).summarize(
|
||||
skip_age_check: skip_age_check,
|
||||
&on_partial_blk
|
||||
)
|
||||
def self.for(topic, user)
|
||||
new(DiscourseAi::Summarization.topic_summary(topic), user)
|
||||
end
|
||||
|
||||
def self.cached_summary(topic, user)
|
||||
new(DiscourseAi::Summarization.default_strategy, topic, user).cached_summary
|
||||
end
|
||||
|
||||
def initialize(strategy, topic, user)
|
||||
@strategy = strategy
|
||||
@topic = topic
|
||||
def initialize(summarizer, user)
|
||||
@summarizer = summarizer
|
||||
@user = user
|
||||
end
|
||||
|
||||
attr_reader :strategy, :topic, :user
|
||||
|
||||
def cached_summary
|
||||
existing_summary
|
||||
summarizer.existing_summary
|
||||
end
|
||||
|
||||
def summarize(skip_age_check: false, &on_partial_blk)
|
||||
# Existing summary shouldn't be nil in this scenario because the controller checks its existence.
|
||||
return if !user && !existing_summary
|
||||
return if !user && !cached_summary
|
||||
|
||||
return existing_summary if use_cached?(skip_age_check)
|
||||
return cached_summary if use_cached?(skip_age_check)
|
||||
|
||||
delete_cached_summaries! if existing_summary
|
||||
summarizer.delete_cached_summaries! if cached_summary
|
||||
|
||||
content = {
|
||||
resource_path: "#{Discourse.base_path}/t/-/#{topic.id}",
|
||||
content_title: topic.title,
|
||||
contents: [],
|
||||
}
|
||||
|
||||
summary_targets_data.map do |(pn, raw, username)|
|
||||
raw_text = raw
|
||||
|
||||
if pn == 1 && topic.topic_embed&.embed_content_cache.present?
|
||||
raw_text = topic.topic_embed&.embed_content_cache
|
||||
end
|
||||
|
||||
content[:contents] << { poster: username, id: pn, text: raw_text }
|
||||
end
|
||||
|
||||
summarization_result = strategy.summarize(content, user, &on_partial_blk)
|
||||
cache_summary(summarization_result)
|
||||
end
|
||||
|
||||
def summary_targets
|
||||
topic.has_summary? ? best_replies : pick_selection
|
||||
summarizer.summarize(user, &on_partial_blk)
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def summary_sha
|
||||
@summary_sha ||= build_sha(summary_targets_data.map(&:first))
|
||||
end
|
||||
|
||||
def summary_targets_data
|
||||
@summary_targets_data ||= summary_targets.pluck(:post_number, :raw, :username)
|
||||
end
|
||||
|
||||
def existing_summary
|
||||
if !defined?(@existing_summary)
|
||||
@existing_summary = AiSummary.find_by(target: topic)
|
||||
if @existing_summary && existing_summary.original_content_sha != summary_sha
|
||||
@existing_summary.mark_as_outdated
|
||||
end
|
||||
end
|
||||
@existing_summary
|
||||
end
|
||||
|
||||
def best_replies
|
||||
Post
|
||||
.summary(topic.id)
|
||||
.where("post_type = ?", Post.types[:regular])
|
||||
.where("NOT hidden")
|
||||
.joins(:user)
|
||||
.order(:post_number)
|
||||
end
|
||||
|
||||
def pick_selection
|
||||
posts =
|
||||
Post
|
||||
.where(topic_id: topic.id)
|
||||
.where("post_type = ?", Post.types[:regular])
|
||||
.where("NOT hidden")
|
||||
.order(:post_number)
|
||||
|
||||
post_numbers = posts.limit(5).pluck(:post_number)
|
||||
post_numbers += posts.reorder("posts.score desc").limit(50).pluck(:post_number)
|
||||
post_numbers += posts.reorder("post_number desc").limit(5).pluck(:post_number)
|
||||
|
||||
Post
|
||||
.where(topic_id: topic.id)
|
||||
.joins(:user)
|
||||
.where("post_number in (?)", post_numbers)
|
||||
.order(:post_number)
|
||||
end
|
||||
|
||||
def delete_cached_summaries!
|
||||
AiSummary.where(target: topic).destroy_all
|
||||
end
|
||||
attr_reader :summarizer, :user
|
||||
|
||||
def use_cached?(skip_age_check)
|
||||
can_summarize = Guardian.new(user).can_request_summary?
|
||||
|
||||
existing_summary &&
|
||||
cached_summary &&
|
||||
!(
|
||||
can_summarize && new_targets? &&
|
||||
(skip_age_check || existing_summary.created_at < 1.hour.ago)
|
||||
can_summarize && cached_summary.outdated &&
|
||||
(skip_age_check || cached_summary.created_at < 1.hour.ago)
|
||||
)
|
||||
end
|
||||
|
||||
def new_targets?
|
||||
existing_summary&.original_content_sha != summary_sha
|
||||
end
|
||||
|
||||
def cache_summary(result)
|
||||
post_numbers = summary_targets_data.map(&:first)
|
||||
|
||||
cached_summary =
|
||||
AiSummary.create!(
|
||||
target: topic,
|
||||
algorithm: strategy.display_name,
|
||||
content_range: (post_numbers.first..post_numbers.last),
|
||||
summarized_text: result[:summary],
|
||||
original_content_sha: summary_sha,
|
||||
)
|
||||
|
||||
cached_summary
|
||||
end
|
||||
|
||||
def build_sha(ids)
|
||||
Digest::SHA256.hexdigest(ids.join)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
# frozen_string_literal: true
|
||||
class AddAiSummaryTypeColumn < ActiveRecord::Migration[7.1]
|
||||
def change
|
||||
add_column :ai_summaries, :summary_type, :integer, default: 0, null: false
|
||||
end
|
||||
end
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
module DiscourseAi
|
||||
module GuardianExtensions
|
||||
def can_see_summary?(target)
|
||||
def can_see_summary?(target, summary_type)
|
||||
return false if !SiteSetting.ai_summarization_enabled
|
||||
|
||||
if target.class == Topic && target.private_message?
|
||||
|
@ -14,7 +14,7 @@ module DiscourseAi
|
|||
return false if !allowed
|
||||
end
|
||||
|
||||
has_cached_summary = AiSummary.exists?(target: target)
|
||||
has_cached_summary = AiSummary.exists?(target: target, summary_type: summary_type)
|
||||
return has_cached_summary if user.nil?
|
||||
|
||||
has_cached_summary || can_request_summary?
|
||||
|
|
|
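With this change, callers must say which kind of summary they want to surface. A brief sketch (the `current_user` and `topic` variables are illustrative):

guardian = Guardian.new(current_user)

guardian.can_see_summary?(topic, AiSummary.summary_types[:complete])  # full topic summaries
guardian.can_see_summary?(topic, AiSummary.summary_types[:gist])      # gists are gated separately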
@ -1,9 +1,36 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
module DiscourseAi
|
||||
module Summarization
|
||||
def self.default_strategy
|
||||
def self.topic_summary(topic)
|
||||
if SiteSetting.ai_summarization_model.present? && SiteSetting.ai_summarization_enabled
|
||||
DiscourseAi::Summarization::Strategies::FoldContent.new(SiteSetting.ai_summarization_model)
|
||||
DiscourseAi::Summarization::FoldContent.new(
|
||||
DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_summarization_model),
|
||||
DiscourseAi::Summarization::Strategies::TopicSummary.new(topic),
|
||||
)
|
||||
else
|
||||
nil
|
||||
end
|
||||
end
|
||||
|
||||
def self.topic_gist(topic)
|
||||
if SiteSetting.ai_summarization_model.present? && SiteSetting.ai_summarization_enabled
|
||||
DiscourseAi::Summarization::FoldContent.new(
|
||||
DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_summarization_model),
|
||||
DiscourseAi::Summarization::Strategies::TopicGist.new(topic),
|
||||
)
|
||||
else
|
||||
nil
|
||||
end
|
||||
end
|
||||
|
||||
def self.chat_channel_summary(channel, time_window_in_hours)
|
||||
if SiteSetting.ai_summarization_model.present? && SiteSetting.ai_summarization_enabled
|
||||
DiscourseAi::Summarization::FoldContent.new(
|
||||
DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_summarization_model),
|
||||
DiscourseAi::Summarization::Strategies::ChatMessages.new(channel, time_window_in_hours),
|
||||
persist_summaries: false,
|
||||
)
|
||||
else
|
||||
nil
|
||||
end
|
||||
|
|
|
@ -2,14 +2,6 @@
|
|||
|
||||
module DiscourseAi
|
||||
module Summarization
|
||||
def self.default_strategy
|
||||
if SiteSetting.ai_summarization_model.present? && SiteSetting.ai_summarization_enabled
|
||||
DiscourseAi::Summarization::Strategies::FoldContent.new(SiteSetting.ai_summarization_model)
|
||||
else
|
||||
nil
|
||||
end
|
||||
end
|
||||
|
||||
class EntryPoint
|
||||
def inject_into(plugin)
|
||||
plugin.add_to_serializer(:current_user, :can_summarize) do
|
||||
|
@ -18,11 +10,11 @@ module DiscourseAi
|
|||
end
|
||||
|
||||
plugin.add_to_serializer(:topic_view, :summarizable) do
|
||||
scope.can_see_summary?(object.topic)
|
||||
scope.can_see_summary?(object.topic, AiSummary.summary_types[:complete])
|
||||
end
|
||||
|
||||
plugin.add_to_serializer(:web_hook_topic_view, :summarizable) do
|
||||
scope.can_see_summary?(object.topic)
|
||||
scope.can_see_summary?(object.topic, AiSummary.summary_types[:complete])
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -0,0 +1,189 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
module DiscourseAi
|
||||
module Summarization
|
||||
# This class offers a generic way of summarizing content from multiple sources using different prompts.
|
||||
#
|
||||
# It summarizes large amounts of content by recursively summarizing it in smaller chunks that
|
||||
# fit the given model context window, finally concatenating the disjoint summaries
|
||||
# into a final version.
|
||||
#
|
||||
class FoldContent
|
||||
def initialize(llm, strategy, persist_summaries: true)
|
||||
@llm = llm
|
||||
@strategy = strategy
|
||||
@persist_summaries = persist_summaries
|
||||
end
|
||||
|
||||
attr_reader :llm, :strategy
|
||||
|
||||
# @param user { User } - User object used for auditing usage.
|
||||
#
|
||||
# @param &on_partial_blk { Block - Optional } - The passed block will get called with the LLM partial response alongside a cancel function.
|
||||
# Note: The block is only called with results of the final summary, not intermediate summaries.
|
||||
#
|
||||
# @returns { AiSummary } - Resulting summary.
|
||||
def summarize(user, &on_partial_blk)
|
||||
opts = content_to_summarize.except(:contents)
|
||||
|
||||
initial_chunks =
|
||||
rebalance_chunks(
|
||||
content_to_summarize[:contents].map do |c|
|
||||
{ ids: [c[:id]], summary: format_content_item(c) }
|
||||
end,
|
||||
)
|
||||
|
||||
# Special case where we can do all the summarization in one pass.
|
||||
result =
|
||||
if initial_chunks.length == 1
|
||||
{
|
||||
summary:
|
||||
summarize_single(initial_chunks.first[:summary], user, opts, &on_partial_blk),
|
||||
chunks: [],
|
||||
}
|
||||
else
|
||||
summarize_chunks(initial_chunks, user, opts, &on_partial_blk)
|
||||
end
|
||||
|
||||
if persist_summaries
|
||||
AiSummary.store!(
|
||||
strategy.target,
|
||||
strategy.type,
|
||||
llm_model.name,
|
||||
result[:summary],
|
||||
content_to_summarize[:contents].map { |c| c[:id] },
|
||||
)
|
||||
else
|
||||
AiSummary.new(summarized_text: result[:summary])
|
||||
end
|
||||
end
|
||||
|
||||
# @returns { AiSummary } - Resulting summary.
|
||||
#
|
||||
# Finds a summary matching the target and strategy. Marks it as outdated if the strategy found newer content.
|
||||
def existing_summary
|
||||
if !defined?(@existing_summary)
|
||||
summary = AiSummary.find_by(target: strategy.target, summary_type: strategy.type)
|
||||
|
||||
if summary
|
||||
@existing_summary = summary
|
||||
|
||||
if existing_summary.original_content_sha != latest_sha
|
||||
@existing_summary.mark_as_outdated
|
||||
end
|
||||
end
|
||||
end
|
||||
@existing_summary
|
||||
end
|
||||
|
||||
def delete_cached_summaries!
|
||||
AiSummary.where(target: strategy.target, summary_type: strategy.type).destroy_all
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
attr_reader :persist_summaries
|
||||
|
||||
def llm_model
|
||||
llm.llm_model
|
||||
end
|
||||
|
||||
def content_to_summarize
|
||||
@targets_data ||= strategy.targets_data
|
||||
end
|
||||
|
||||
def latest_sha
|
||||
@latest_sha ||= AiSummary.build_sha(content_to_summarize[:contents].map { |c| c[:id] }.join)
|
||||
end
|
||||
|
||||
def summarize_chunks(chunks, user, opts, &on_partial_blk)
|
||||
# Safely assume we always have more than one chunk.
|
||||
summarized_chunks = summarize_in_chunks(chunks, user, opts)
|
||||
total_summaries_size =
|
||||
llm_model.tokenizer_class.size(summarized_chunks.map { |s| s[:summary].to_s }.join)
|
||||
|
||||
if total_summaries_size < available_tokens
|
||||
# Chunks are small enough, we can concatenate them.
|
||||
{
|
||||
summary:
|
||||
concatenate_summaries(
|
||||
summarized_chunks.map { |s| s[:summary] },
|
||||
user,
|
||||
&on_partial_blk
|
||||
),
|
||||
chunks: summarized_chunks,
|
||||
}
|
||||
else
|
||||
# We have summarized chunks but we can't concatenate them yet. Split them into smaller summaries and summarize again.
|
||||
rebalanced_chunks = rebalance_chunks(summarized_chunks)
|
||||
|
||||
summarize_chunks(rebalanced_chunks, user, opts, &on_partial_blk)
|
||||
end
|
||||
end
|
||||
|
||||
def format_content_item(item)
|
||||
"(#{item[:id]} #{item[:poster]} said: #{item[:text]} "
|
||||
end
|
||||
|
||||
def rebalance_chunks(chunks)
|
||||
section = { ids: [], summary: "" }
|
||||
|
||||
chunks =
|
||||
chunks.reduce([]) do |sections, chunk|
|
||||
if llm_model.tokenizer_class.can_expand_tokens?(
|
||||
section[:summary],
|
||||
chunk[:summary],
|
||||
available_tokens,
|
||||
)
|
||||
section[:summary] += chunk[:summary]
|
||||
section[:ids] = section[:ids].concat(chunk[:ids])
|
||||
else
|
||||
sections << section
|
||||
section = chunk
|
||||
end
|
||||
|
||||
sections
|
||||
end
|
||||
|
||||
chunks << section if section[:summary].present?
|
||||
|
||||
chunks
|
||||
end
|
||||
|
||||
def summarize_single(text, user, opts, &on_partial_blk)
|
||||
prompt = strategy.summarize_single_prompt(text, opts)
|
||||
|
||||
llm.generate(prompt, user: user, feature_name: "summarize", &on_partial_blk)
|
||||
end
|
||||
|
||||
def summarize_in_chunks(chunks, user, opts)
|
||||
chunks.map do |chunk|
|
||||
prompt = strategy.summarize_single_prompt(chunk[:summary], opts)
|
||||
|
||||
chunk[:summary] = llm.generate(
|
||||
prompt,
|
||||
user: user,
|
||||
max_tokens: 300,
|
||||
feature_name: "summarize",
|
||||
)
|
||||
|
||||
chunk
|
||||
end
|
||||
end
|
||||
|
||||
def concatenate_summaries(texts_to_summarize, user, &on_partial_blk)
|
||||
prompt = strategy.concatenation_prompt(texts_to_summarize)
|
||||
|
||||
llm.generate(prompt, user: user, &on_partial_blk)
|
||||
end
|
||||
|
||||
def available_tokens
|
||||
# Reserve tokens for the response and the base prompt
|
||||
# ~500 words
|
||||
reserved_tokens = 700
|
||||
|
||||
llm_model.max_prompt_tokens - reserved_tokens
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
|
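The folding loop described in the class comment above can be pictured in isolation with a simplified sketch; the `llm_summarize`, `token_count`, and `regroup` helpers below are hypothetical stand-ins, not methods from this diff:

# Simplified illustration of the fold: summarize each chunk, and if the combined
# summaries still exceed the token budget, regroup them and fold again.
def fold(chunks, budget)
  summaries = chunks.map { |chunk| llm_summarize(chunk) }   # hypothetical per-chunk LLM call
  joined = summaries.join("\n")

  if token_count(joined) < budget                           # hypothetical tokenizer check
    llm_summarize(joined)                                   # final concatenation pass
  else
    fold(regroup(summaries, budget), budget)                # hypothetical re-packing under the budget
  end
end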
@ -0,0 +1,59 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
module DiscourseAi
|
||||
module Summarization
|
||||
module Strategies
|
||||
# Objects inheriting from this class will get passed as a dependency to `DiscourseAi::Summarization::FoldContent`.
|
||||
# This collaborator knows how to source the content to summarize and the prompts used in the process,
|
||||
# one for summarizing a chunk and another for concatenating them if necessary.
|
||||
class Base
|
||||
def initialize(target)
|
||||
@target = target
|
||||
end
|
||||
|
||||
attr_reader :target
|
||||
|
||||
# The summary type differentiates instances of `AiSummary` pointing to a single target.
|
||||
# See the `summary_type` enum for available options.
|
||||
def type
|
||||
raise NotImplementedError
|
||||
end
|
||||
|
||||
# @returns { Hash } - Content to summarize.
|
||||
#
|
||||
# This method returns a hash with the content to summarize and additional information.
|
||||
# The only mandatory key is `contents`, which must be an array of hashes with
|
||||
# the following structure:
|
||||
#
|
||||
# {
|
||||
# poster: A way to tell who wrote the content,
|
||||
# id: A number to signal order,
|
||||
# text: Text to summarize
|
||||
# }
|
||||
#
|
||||
# Additionally, you could add more context, which will be available in the prompt. e.g.:
|
||||
#
|
||||
# {
|
||||
# resource_path: "#{Discourse.base_path}/t/-/#{target.id}",
|
||||
# content_title: target.title,
|
||||
# contents: [...]
|
||||
# }
|
||||
#
|
||||
def targets_data
|
||||
raise NotImplementedError
|
||||
end
|
||||
|
||||
# @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM when concatenating multiple chunks.
|
||||
def concatenation_prompt(_texts_to_summarize)
|
||||
raise NotImplementedError
|
||||
end
|
||||
|
||||
# @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM on each chunk,
|
||||
# and when the whole content fits in one call.
|
||||
def summarize_single_prompt(_input, _opts)
|
||||
raise NotImplementedError
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
|
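To make the contract above concrete, a minimal hypothetical strategy might look like the sketch below. It is illustrative only and not part of this commit; it assumes the `Base` API above and the `DiscourseAi::Completions::Prompt` interface used elsewhere in this diff:

# frozen_string_literal: true

module DiscourseAi
  module Summarization
    module Strategies
      # Illustrative example: summarize the raw contents of a single post.
      class SinglePostExample < Base
        def type
          AiSummary.summary_types[:complete]
        end

        def targets_data
          {
            content_title: target.topic&.title,
            contents: [{ id: target.post_number, poster: target.user&.username, text: target.raw }],
          }
        end

        def concatenation_prompt(texts_to_summarize)
          prompt = DiscourseAi::Completions::Prompt.new("Merge the given summaries into one paragraph.")
          prompt.push(type: :user, content: texts_to_summarize.join("\n"))
          prompt
        end

        def summarize_single_prompt(input, opts)
          prompt = DiscourseAi::Completions::Prompt.new("Summarize the given post in one paragraph.")
          prompt.push(type: :user, content: "#{opts[:content_title]}\n\n#{input}")
          prompt
        end
      end
    end
  end
end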
@ -0,0 +1,85 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
module DiscourseAi
|
||||
module Summarization
|
||||
module Strategies
|
||||
class ChatMessages < Base
|
||||
def type
|
||||
AiSummary.summary_types[:complete]
|
||||
end
|
||||
|
||||
def initialize(target, since)
|
||||
super(target)
|
||||
@since = since
|
||||
end
|
||||
|
||||
def targets_data
|
||||
content = { content_title: target.name }
|
||||
|
||||
content[:contents] = target
|
||||
.chat_messages
|
||||
.where("chat_messages.created_at > ?", since.hours.ago)
|
||||
.includes(:user)
|
||||
.order(created_at: :asc)
|
||||
.pluck(:id, :username_lower, :message)
|
||||
.map { { id: _1, poster: _2, text: _3 } }
|
||||
|
||||
content
|
||||
end
|
||||
|
||||
def concatenation_prompt(texts_to_summarize)
|
||||
prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
|
||||
You are a summarization bot tasked with creating a cohesive narrative by intelligently merging multiple disjointed summaries.
|
||||
Your response should consist of well-structured paragraphs that combine these summaries into a clear and comprehensive overview.
|
||||
Avoid adding any additional text or commentary. Format your output using Discourse forum Markdown.
|
||||
TEXT
|
||||
|
||||
prompt.push(type: :user, content: <<~TEXT.strip)
|
||||
THESE are the summaries, each one separated by a newline, all of them inside <input></input> XML tags:
|
||||
|
||||
<input>
|
||||
#{texts_to_summarize.join("\n")}
|
||||
</input>
|
||||
TEXT
|
||||
|
||||
prompt
|
||||
end
|
||||
|
||||
def summarize_single_prompt(input, opts)
|
||||
prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
|
||||
You are a summarization bot designed to generate clear and insightful paragraphs that convey the main topics
|
||||
and developments from a series of chat messages within a user-selected time window.
|
||||
|
||||
Analyze the messages to extract key themes, participants' intentions, and any significant conclusions or decisions.
|
||||
Your summary should be concise yet comprehensive, providing an overview that is accessible to someone with no prior context of the conversation.
|
||||
|
||||
- Only include the summary, WITHOUT additional commentary.
|
||||
- Don't mention the channel title. Avoid including extraneous details or subjective opinions.
|
||||
- Maintain the original language of the text being summarized.
|
||||
- The same user could write multiple messages in a row; don't treat them as different people.
|
||||
- Aim for summaries to be 400 words or less.
|
||||
|
||||
TEXT
|
||||
|
||||
prompt.push(type: :user, content: <<~TEXT.strip)
|
||||
#{opts[:content_title].present? ? "The name of the channel is: " + opts[:content_title] + ".\n" : ""}
|
||||
|
||||
Here are the messages, inside <input></input> XML tags:
|
||||
|
||||
<input>
|
||||
#{input}
|
||||
</input>
|
||||
|
||||
Generate a summary of the given chat messages.
|
||||
TEXT
|
||||
|
||||
prompt
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
attr_reader :since
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
|
@ -1,198 +0,0 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
module DiscourseAi
|
||||
module Summarization
|
||||
module Strategies
|
||||
class FoldContent
|
||||
def initialize(completion_model)
|
||||
@llm = DiscourseAi::Completions::Llm.proxy(completion_model)
|
||||
raise "Invalid model provided for summarization strategy" if @llm.llm_model.nil?
|
||||
end
|
||||
|
||||
attr_reader :llm
|
||||
|
||||
def summarize(content, user, &on_partial_blk)
|
||||
opts = content.except(:contents)
|
||||
|
||||
initial_chunks =
|
||||
rebalance_chunks(
|
||||
content[:contents].map { |c| { ids: [c[:id]], summary: format_content_item(c) } },
|
||||
)
|
||||
|
||||
# Special case where we can do all the summarization in one pass.
|
||||
if initial_chunks.length == 1
|
||||
{
|
||||
summary:
|
||||
summarize_single(initial_chunks.first[:summary], user, opts, &on_partial_blk),
|
||||
chunks: [],
|
||||
}
|
||||
else
|
||||
summarize_chunks(initial_chunks, user, opts, &on_partial_blk)
|
||||
end
|
||||
end
|
||||
|
||||
def display_name
|
||||
llm_model&.name || "unknown model"
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def llm_model
|
||||
llm.llm_model
|
||||
end
|
||||
|
||||
def summarize_chunks(chunks, user, opts, &on_partial_blk)
|
||||
# Safely assume we always have more than one chunk.
|
||||
summarized_chunks = summarize_in_chunks(chunks, user, opts)
|
||||
total_summaries_size =
|
||||
llm_model.tokenizer_class.size(summarized_chunks.map { |s| s[:summary].to_s }.join)
|
||||
|
||||
if total_summaries_size < available_tokens
|
||||
# Chunks are small enough, we can concatenate them.
|
||||
{
|
||||
summary:
|
||||
concatenate_summaries(
|
||||
summarized_chunks.map { |s| s[:summary] },
|
||||
user,
|
||||
&on_partial_blk
|
||||
),
|
||||
chunks: summarized_chunks,
|
||||
}
|
||||
else
|
||||
# We have summarized chunks but we can't concatenate them yet. Split them into smaller summaries and summarize again.
|
||||
rebalanced_chunks = rebalance_chunks(summarized_chunks)
|
||||
|
||||
summarize_chunks(rebalanced_chunks, user, opts, &on_partial_blk)
|
||||
end
|
||||
end
|
||||
|
||||
def format_content_item(item)
|
||||
"(#{item[:id]} #{item[:poster]} said: #{item[:text]} "
|
||||
end
|
||||
|
||||
def rebalance_chunks(chunks)
|
||||
section = { ids: [], summary: "" }
|
||||
|
||||
chunks =
|
||||
chunks.reduce([]) do |sections, chunk|
|
||||
if llm_model.tokenizer_class.can_expand_tokens?(
|
||||
section[:summary],
|
||||
chunk[:summary],
|
||||
available_tokens,
|
||||
)
|
||||
section[:summary] += chunk[:summary]
|
||||
section[:ids] = section[:ids].concat(chunk[:ids])
|
||||
else
|
||||
sections << section
|
||||
section = chunk
|
||||
end
|
||||
|
||||
sections
|
||||
end
|
||||
|
||||
chunks << section if section[:summary].present?
|
||||
|
||||
chunks
|
||||
end
|
||||
|
||||
def summarize_single(text, user, opts, &on_partial_blk)
|
||||
prompt = summarization_prompt(text, opts)
|
||||
|
||||
llm.generate(prompt, user: user, feature_name: "summarize", &on_partial_blk)
|
||||
end
|
||||
|
||||
def summarize_in_chunks(chunks, user, opts)
|
||||
chunks.map do |chunk|
|
||||
prompt = summarization_prompt(chunk[:summary], opts)
|
||||
|
||||
chunk[:summary] = llm.generate(
|
||||
prompt,
|
||||
user: user,
|
||||
max_tokens: 300,
|
||||
feature_name: "summarize",
|
||||
)
|
||||
chunk
|
||||
end
|
||||
end
|
||||
|
||||
def concatenate_summaries(summaries, user, &on_partial_blk)
|
||||
prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
|
||||
You are a summarization bot that effectively concatenates disjoint summaries, creating a cohesive narrative.
|
||||
The narrative you create is in the form of one or multiple paragraphs.
|
||||
Your reply MUST BE a single concatenated summary using the summaries I'll provide to you.
|
||||
I'm NOT interested in anything other than the concatenated summary, don't include additional text or comments.
|
||||
You understand and generate Discourse forum Markdown.
|
||||
You format the response, including links, using Markdown.
|
||||
TEXT
|
||||
|
||||
prompt.push(type: :user, content: <<~TEXT.strip)
|
||||
THESE are the summaries, each one separated by a newline, all of them inside <input></input> XML tags:
|
||||
|
||||
<input>
|
||||
#{summaries.join("\n")}
|
||||
</input>
|
||||
TEXT
|
||||
|
||||
llm.generate(prompt, user: user, &on_partial_blk)
|
||||
end
|
||||
|
||||
def summarization_prompt(input, opts)
|
||||
insts = +<<~TEXT
|
||||
You are an advanced summarization bot that generates concise, coherent summaries of provided text.
|
||||
|
||||
- Only include the summary, without any additional commentary.
|
||||
- You understand and generate Discourse forum Markdown; including links, _italics_, **bold**.
|
||||
- Maintain the original language of the text being summarized.
|
||||
- Aim for summaries to be 400 words or less.
|
||||
|
||||
TEXT
|
||||
|
||||
insts << <<~TEXT if opts[:resource_path]
|
||||
- Each post is formatted as "<POST_NUMBER>) <USERNAME> <MESSAGE>"
|
||||
- Cite specific noteworthy posts using the format [NAME](#{opts[:resource_path]}/POST_NUMBER)
|
||||
- Example: link to the 3rd post by sam: [sam](#{opts[:resource_path]}/3)
|
||||
- Example: link to the 6th post by jane: [agreed with](#{opts[:resource_path]}/6)
|
||||
- Example: link to the 13th post by joe: [#13](#{opts[:resource_path]}/13)
|
||||
- When formatting usernames either use @USERNMAE OR [USERNAME](#{opts[:resource_path]}/POST_NUMBER)
|
||||
TEXT
|
||||
|
||||
prompt = DiscourseAi::Completions::Prompt.new(insts.strip)
|
||||
|
||||
if opts[:resource_path]
|
||||
prompt.push(
|
||||
type: :user,
|
||||
content:
|
||||
"Here are the posts inside <input></input> XML tags:\n\n<input>1) user1 said: I love Mondays 2) user2 said: I hate Mondays</input>\n\nGenerate a concise, coherent summary of the text above maintaining the original language.",
|
||||
)
|
||||
prompt.push(
|
||||
type: :model,
|
||||
content:
|
||||
"Two users are sharing their feelings toward Mondays. [user1](#{opts[:resource_path]}/1) hates them, while [user2](#{opts[:resource_path]}/2) loves them.",
|
||||
)
|
||||
end
|
||||
|
||||
prompt.push(type: :user, content: <<~TEXT.strip)
|
||||
#{opts[:content_title].present? ? "The discussion title is: " + opts[:content_title] + ".\n" : ""}
|
||||
Here are the posts, inside <input></input> XML tags:
|
||||
|
||||
<input>
|
||||
#{input}
|
||||
</input>
|
||||
|
||||
Generate a concise, coherent summary of the text above maintaining the original language.
|
||||
TEXT
|
||||
|
||||
prompt
|
||||
end
|
||||
|
||||
def available_tokens
|
||||
# Reserve tokens for the response and the base prompt
|
||||
# ~500 words
|
||||
reserved_tokens = 700
|
||||
|
||||
llm_model.max_prompt_tokens - reserved_tokens
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
|
@ -0,0 +1,90 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
module DiscourseAi
|
||||
module Summarization
|
||||
module Strategies
|
||||
class TopicGist < Base
|
||||
def type
|
||||
AiSummary.summary_types[:gist]
|
||||
end
|
||||
|
||||
def targets_data
|
||||
content = { content_title: target.title, contents: [] }
|
||||
|
||||
op_post_number = 1
|
||||
|
||||
last_twenty_posts =
|
||||
Post
|
||||
.where(topic_id: target.id)
|
||||
.where("post_type = ?", Post.types[:regular])
|
||||
.where("NOT hidden")
|
||||
.order("post_number DESC")
|
||||
.limit(20)
|
||||
.pluck(:post_number)
|
||||
|
||||
posts_data =
|
||||
Post
|
||||
.where(topic_id: target.id)
|
||||
.joins(:user)
|
||||
.where("post_number IN (?)", last_twenty_posts << op_post_number)
|
||||
.order(:post_number)
|
||||
.pluck(:post_number, :raw, :username)
|
||||
|
||||
posts_data.each do |(pn, raw, username)|
|
||||
raw_text = raw
|
||||
|
||||
if pn == 1 && target.topic_embed&.embed_content_cache.present?
|
||||
raw_text = target.topic_embed&.embed_content_cache
|
||||
end
|
||||
|
||||
content[:contents] << { poster: username, id: pn, text: raw_text }
|
||||
end
|
||||
|
||||
content
|
||||
end
|
||||
|
||||
def concatenation_prompt(texts_to_summarize)
|
||||
prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
|
||||
You are a summarization bot tasked with creating a single, concise sentence by merging disjointed summaries into a cohesive statement.
|
||||
Your response should strictly be this single, comprehensive sentence, without any additional text or comments.
|
||||
TEXT
|
||||
|
||||
prompt.push(type: :user, content: <<~TEXT.strip)
|
||||
THESE are the summaries, each one separated by a newline, all of them inside <input></input> XML tags:
|
||||
|
||||
<input>
|
||||
#{texts_to_summarize.join("\n")}
|
||||
</input>
|
||||
TEXT
|
||||
|
||||
prompt
|
||||
end
|
||||
|
||||
def summarize_single_prompt(input, opts)
|
||||
prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
|
||||
You are an advanced summarization bot. Your task is to analyze a given conversation and generate a single,
|
||||
concise sentence that clearly conveys the main topic and purpose of the discussion to someone with no prior context.
|
||||
|
||||
- Focus on the central theme or issue being addressed, while maintaining an objective and neutral tone.
|
||||
- Avoid including extraneous details or subjective opinions.
|
||||
- Maintain the original language of the text being summarized.
|
||||
TEXT
|
||||
|
||||
prompt.push(type: :user, content: <<~TEXT.strip)
|
||||
#{opts[:content_title].present? ? "The discussion title is: " + opts[:content_title] + ".\n" : ""}
|
||||
|
||||
Here are the posts, inside <input></input> XML tags:
|
||||
|
||||
<input>
|
||||
#{input}
|
||||
</input>
|
||||
|
||||
Generate a single sentence summarizing the text above, maintaining the original language.
|
||||
TEXT
|
||||
|
||||
prompt
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
|
@ -0,0 +1,142 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
module DiscourseAi
|
||||
module Summarization
|
||||
module Strategies
|
||||
class TopicSummary < Base
|
||||
def type
|
||||
AiSummary.summary_types[:complete]
|
||||
end
|
||||
|
||||
def targets_data
|
||||
content = {
|
||||
resource_path: "#{Discourse.base_path}/t/-/#{target.id}",
|
||||
content_title: target.title,
|
||||
contents: [],
|
||||
}
|
||||
|
||||
posts_data =
|
||||
(target.has_summary? ? best_replies : pick_selection).pluck(
|
||||
:post_number,
|
||||
:raw,
|
||||
:username,
|
||||
)
|
||||
|
||||
posts_data.each do |(pn, raw, username)|
|
||||
raw_text = raw
|
||||
|
||||
if pn == 1 && target.topic_embed&.embed_content_cache.present?
|
||||
raw_text = target.topic_embed&.embed_content_cache
|
||||
end
|
||||
|
||||
content[:contents] << { poster: username, id: pn, text: raw_text }
|
||||
end
|
||||
|
||||
content
|
||||
end
|
||||
|
||||
def concatenation_prompt(texts_to_summarize)
|
||||
prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
|
||||
You are a summarization bot that effectively concatenates disjointed summaries, creating a cohesive narrative.
|
||||
The narrative you create is in the form of one or multiple paragraphs.
|
||||
Your reply MUST BE a single concatenated summary using the summaries I'll provide to you.
|
||||
I'm NOT interested in anything other than the concatenated summary, don't include additional text or comments.
|
||||
You understand and generate Discourse forum Markdown.
|
||||
You format the response, including links, using Markdown.
|
||||
TEXT
|
||||
|
||||
prompt.push(type: :user, content: <<~TEXT.strip)
|
||||
THESE are the summaries, each one separated by a newline, all of them inside <input></input> XML tags:
|
||||
|
||||
<input>
|
||||
#{texts_to_summarize.join("\n")}
|
||||
</input>
|
||||
TEXT
|
||||
|
||||
prompt
|
||||
end
|
||||
|
||||
def summarize_single_prompt(input, opts)
|
||||
insts = +<<~TEXT
|
||||
You are an advanced summarization bot that generates concise, coherent summaries of provided text.
|
||||
|
||||
- Only include the summary, without any additional commentary.
|
||||
- You understand and generate Discourse forum Markdown; including links, _italics_, **bold**.
|
||||
- Maintain the original language of the text being summarized.
|
||||
- Aim for summaries to be 400 words or less.
|
||||
|
||||
TEXT
|
||||
|
||||
insts << <<~TEXT if opts[:resource_path]
|
||||
- Each post is formatted as "<POST_NUMBER>) <USERNAME> <MESSAGE>"
|
||||
- Cite specific noteworthy posts using the format [NAME](#{opts[:resource_path]}/POST_NUMBER)
|
||||
- Example: link to the 3rd post by sam: [sam](#{opts[:resource_path]}/3)
|
||||
- Example: link to the 6th post by jane: [agreed with](#{opts[:resource_path]}/6)
|
||||
- Example: link to the 13th post by joe: [#13](#{opts[:resource_path]}/13)
|
||||
- When formatting usernames either use @USERNAME OR [USERNAME](#{opts[:resource_path]}/POST_NUMBER)
|
||||
TEXT
|
||||
|
||||
prompt = DiscourseAi::Completions::Prompt.new(insts.strip)
|
||||
|
||||
if opts[:resource_path]
|
||||
prompt.push(
|
||||
type: :user,
|
||||
content:
|
||||
"Here are the posts inside <input></input> XML tags:\n\n<input>1) user1 said: I love Mondays 2) user2 said: I hate Mondays</input>\n\nGenerate a concise, coherent summary of the text above maintaining the original language.",
|
||||
)
|
||||
prompt.push(
|
||||
type: :model,
|
||||
content:
|
||||
"Two users are sharing their feelings toward Mondays. [user1](#{opts[:resource_path]}/1) hates them, while [user2](#{opts[:resource_path]}/2) loves them.",
|
||||
)
|
||||
end
|
||||
|
||||
prompt.push(type: :user, content: <<~TEXT.strip)
|
||||
#{opts[:content_title].present? ? "The discussion title is: " + opts[:content_title] + ".\n" : ""}
|
||||
Here are the posts, inside <input></input> XML tags:
|
||||
|
||||
<input>
|
||||
#{input}
|
||||
</input>
|
||||
|
||||
Generate a concise, coherent summary of the text above maintaining the original language.
|
||||
TEXT
|
||||
|
||||
prompt
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
attr_reader :topic
|
||||
|
||||
def best_replies
|
||||
Post
|
||||
.summary(target.id)
|
||||
.where("post_type = ?", Post.types[:regular])
|
||||
.where("NOT hidden")
|
||||
.joins(:user)
|
||||
.order(:post_number)
|
||||
end
|
||||
|
||||
def pick_selection
|
||||
posts =
|
||||
Post
|
||||
.where(topic_id: target.id)
|
||||
.where("post_type = ?", Post.types[:regular])
|
||||
.where("NOT hidden")
|
||||
.order(:post_number)
|
||||
|
||||
post_numbers = posts.limit(5).pluck(:post_number)
|
||||
post_numbers += posts.reorder("posts.score desc").limit(50).pluck(:post_number)
|
||||
post_numbers += posts.reorder("post_number desc").limit(5).pluck(:post_number)
|
||||
|
||||
Post
|
||||
.where(topic_id: target.id)
|
||||
.joins(:user)
|
||||
.where("post_number in (?)", post_numbers)
|
||||
.order(:post_number)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
|
@ -20,7 +20,7 @@ describe DiscourseAi::GuardianExtensions do
|
|||
it "returns false" do
|
||||
SiteSetting.ai_custom_summarization_allowed_groups = ""
|
||||
|
||||
expect(guardian.can_see_summary?(topic)).to eq(false)
|
||||
expect(guardian.can_see_summary?(topic, AiSummary.summary_types[:complete])).to eq(false)
|
||||
end
|
||||
|
||||
it "returns true if there is a cached summary" do
|
||||
|
@ -29,9 +29,10 @@ describe DiscourseAi::GuardianExtensions do
|
|||
summarized_text: "test",
|
||||
original_content_sha: "123",
|
||||
algorithm: "test",
|
||||
summary_type: AiSummary.summary_types[:complete],
|
||||
)
|
||||
|
||||
expect(guardian.can_see_summary?(topic)).to eq(true)
|
||||
expect(guardian.can_see_summary?(topic, AiSummary.summary_types[:complete])).to eq(true)
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -39,7 +40,7 @@ describe DiscourseAi::GuardianExtensions do
|
|||
before { SiteSetting.ai_custom_summarization_allowed_groups = group.id }
|
||||
|
||||
it "returns true if the user group is present in the ai_custom_summarization_allowed_groups_map setting" do
|
||||
expect(guardian.can_see_summary?(topic)).to eq(true)
|
||||
expect(guardian.can_see_summary?(topic, AiSummary.summary_types[:complete])).to eq(true)
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -48,12 +49,12 @@ describe DiscourseAi::GuardianExtensions do
|
|||
let(:pm) { Fabricate(:private_message_topic) }
|
||||
|
||||
it "returns false" do
|
||||
expect(guardian.can_see_summary?(pm)).to eq(false)
|
||||
expect(guardian.can_see_summary?(pm, AiSummary.summary_types[:complete])).to eq(false)
|
||||
end
|
||||
|
||||
it "returns true if user is in a group that is allowed summaries" do
|
||||
SiteSetting.ai_pm_summarization_allowed_groups = group.id
|
||||
expect(guardian.can_see_summary?(pm)).to eq(true)
|
||||
expect(guardian.can_see_summary?(pm, AiSummary.summary_types[:complete])).to eq(true)
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -61,7 +62,7 @@ describe DiscourseAi::GuardianExtensions do
|
|||
let(:guardian) { Guardian.new }
|
||||
|
||||
it "returns false for anons" do
|
||||
expect(guardian.can_see_summary?(topic)).to eq(false)
|
||||
expect(guardian.can_see_summary?(topic, AiSummary.summary_types[:complete])).to eq(false)
|
||||
end
|
||||
|
||||
it "returns true for anons when there is a cached summary" do
|
||||
|
@ -70,9 +71,10 @@ describe DiscourseAi::GuardianExtensions do
|
|||
summarized_text: "test",
|
||||
original_content_sha: "123",
|
||||
algorithm: "test",
|
||||
summary_type: AiSummary.summary_types[:complete],
|
||||
)
|
||||
|
||||
expect(guardian.can_see_summary?(topic)).to eq(true)
|
||||
expect(guardian.can_see_summary?(topic, AiSummary.summary_types[:complete])).to eq(true)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -1,9 +1,14 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
RSpec.describe DiscourseAi::Summarization::Strategies::FoldContent do
|
||||
RSpec.describe DiscourseAi::Summarization::FoldContent do
|
||||
subject(:summarizer) { DiscourseAi::Summarization.topic_summary(topic) }
|
||||
|
||||
describe "#summarize" do
|
||||
let!(:llm_model) { assign_fake_provider_to(:ai_summarization_model) }
|
||||
|
||||
fab!(:topic) { Fabricate(:topic, highest_post_number: 2) }
|
||||
fab!(:post_1) { Fabricate(:post, topic: topic, post_number: 1, raw: "This is a text") }
|
||||
|
||||
before do
|
||||
SiteSetting.ai_summarization_enabled = true
|
||||
|
||||
|
@ -15,10 +20,6 @@ RSpec.describe DiscourseAi::Summarization::Strategies::FoldContent do
|
|||
llm_model.update!(max_prompt_tokens: model_tokens)
|
||||
end
|
||||
|
||||
let(:strategy) { DiscourseAi::Summarization.default_strategy }
|
||||
let(:summarize_text) { "This is a text" }
|
||||
let(:content) { { contents: [{ poster: "asd", id: 1, text: summarize_text }] } }
|
||||
|
||||
let(:single_summary) { "this is a single summary" }
|
||||
let(:concatenated_summary) { "this is a concatenated summary" }
|
||||
|
||||
|
@ -28,27 +29,26 @@ RSpec.describe DiscourseAi::Summarization::Strategies::FoldContent do
|
|||
it "does one call to summarize content" do
|
||||
result =
|
||||
DiscourseAi::Completions::Llm.with_prepared_responses([single_summary]) do |spy|
|
||||
strategy.summarize(content, user).tap { expect(spy.completions).to eq(1) }
|
||||
summarizer.summarize(user).tap { expect(spy.completions).to eq(1) }
|
||||
end
|
||||
|
||||
expect(result[:summary]).to eq(single_summary)
|
||||
expect(result.summarized_text).to eq(single_summary)
|
||||
end
|
||||
end
|
||||
|
||||
context "when the content to summarize doesn't fit in a single call" do
|
||||
it "summarizes each chunk and then concatenates them" do
|
||||
content[:contents] << { poster: "asd2", id: 2, text: summarize_text }
|
||||
fab!(:post_2) { Fabricate(:post, topic: topic, post_number: 2, raw: "This is a text") }
|
||||
|
||||
it "summarizes each chunk and then concatenates them" do
|
||||
result =
|
||||
DiscourseAi::Completions::Llm.with_prepared_responses(
|
||||
[single_summary, single_summary, concatenated_summary],
|
||||
) { |spy| strategy.summarize(content, user).tap { expect(spy.completions).to eq(3) } }
|
||||
) { |spy| summarizer.summarize(user).tap { expect(spy.completions).to eq(3) } }
|
||||
|
||||
expect(result[:summary]).to eq(concatenated_summary)
|
||||
expect(result.summarized_text).to eq(concatenated_summary)
|
||||
end
|
||||
|
||||
it "keeps splitting into chunks until the content fits into a single call to create a cohesive narrative" do
|
||||
content[:contents] << { poster: "asd2", id: 2, text: summarize_text }
|
||||
max_length_response = "(1 asd said: This is a text "
|
||||
chunk_of_chunks = "I'm smol"
|
||||
|
||||
|
@ -61,9 +61,9 @@ RSpec.describe DiscourseAi::Summarization::Strategies::FoldContent do
|
|||
chunk_of_chunks,
|
||||
concatenated_summary,
|
||||
],
|
||||
) { |spy| strategy.summarize(content, user).tap { expect(spy.completions).to eq(5) } }
|
||||
) { |spy| summarizer.summarize(user).tap { expect(spy.completions).to eq(5) } }
|
||||
|
||||
expect(result[:summary]).to eq(concatenated_summary)
|
||||
expect(result.summarized_text).to eq(concatenated_summary)
|
||||
end
|
||||
end
|
||||
end
|
|
@ -0,0 +1,70 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
RSpec.describe DiscourseAi::Summarization::Strategies::TopicGist do
|
||||
subject(:gist) { described_class.new(topic) }
|
||||
|
||||
fab!(:topic) { Fabricate(:topic, highest_post_number: 25) }
|
||||
fab!(:post_1) { Fabricate(:post, topic: topic, post_number: 1) }
|
||||
fab!(:post_2) { Fabricate(:post, topic: topic, post_number: 2) }
|
||||
|
||||
describe "#targets_data" do
|
||||
context "when the topic has more than 20 posts" do
|
||||
before do
|
||||
offset = 3 # Already created posts 1 and 2
|
||||
(topic.highest_post_number - 2).times do |i|
|
||||
Fabricate(:post, topic: topic, post_number: i + offset)
|
||||
end
|
||||
end
|
||||
|
||||
it "includes the OP and the last 20 posts" do
|
||||
content = gist.targets_data
|
||||
post_numbers = content[:contents].map { |c| c[:id] }
|
||||
|
||||
expected = (6..25).to_a << 1
|
||||
|
||||
expect(post_numbers).to contain_exactly(*expected)
|
||||
end
|
||||
end
|
||||
|
||||
it "only includes visible posts" do
|
||||
post_2.update!(hidden: true)
|
||||
|
||||
post_numbers = gist.targets_data[:contents].map { |c| c[:id] }
|
||||
|
||||
expect(post_numbers).to contain_exactly(1)
|
||||
end
|
||||
|
||||
it "doesn't include posts without users" do
|
||||
post_2.update!(user_id: nil)
|
||||
|
||||
post_numbers = gist.targets_data[:contents].map { |c| c[:id] }
|
||||
|
||||
expect(post_numbers).to contain_exactly(1)
|
||||
end
|
||||
|
||||
it "doesn't include whispers" do
|
||||
post_2.update!(post_type: Post.types[:whisper])
|
||||
|
||||
post_numbers = gist.targets_data[:contents].map { |c| c[:id] }
|
||||
|
||||
expect(post_numbers).to contain_exactly(1)
|
||||
end
|
||||
|
||||
context "when the topic has embed content cached" do
|
||||
it "embed content is used instead of the raw text" do
|
||||
topic_embed =
|
||||
Fabricate(
|
||||
:topic_embed,
|
||||
topic: topic,
|
||||
embed_content_cache: "<p>hello world new post :D</p>",
|
||||
)
|
||||
|
||||
content = gist.targets_data
|
||||
|
||||
op_content = content[:contents].first[:text]
|
||||
|
||||
expect(op_content).to include(topic_embed.embed_content_cache)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
|
@ -0,0 +1,66 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
RSpec.describe DiscourseAi::Summarization::Strategies::TopicSummary do
|
||||
subject(:topic_summary) { described_class.new(topic) }
|
||||
|
||||
fab!(:topic) { Fabricate(:topic, highest_post_number: 25) }
|
||||
fab!(:post_1) { Fabricate(:post, topic: topic, post_number: 1) }
|
||||
fab!(:post_2) { Fabricate(:post, topic: topic, post_number: 2) }
|
||||
|
||||
describe "#targets_data" do
|
||||
shared_examples "includes only public-visible topics" do
|
||||
it "only includes visible posts" do
|
||||
post_2.update!(hidden: true)
|
||||
|
||||
post_numbers = topic_summary.targets_data[:contents].map { |c| c[:id] }
|
||||
|
||||
expect(post_numbers).to contain_exactly(1)
|
||||
end
|
||||
|
||||
it "doesn't include posts without users" do
|
||||
post_2.update!(user_id: nil)
|
||||
|
||||
post_numbers = topic_summary.targets_data[:contents].map { |c| c[:id] }
|
||||
|
||||
expect(post_numbers).to contain_exactly(1)
|
||||
end
|
||||
|
||||
it "doesn't include whispers" do
|
||||
post_2.update!(post_type: Post.types[:whisper])
|
||||
|
||||
post_numbers = topic_summary.targets_data[:contents].map { |c| c[:id] }
|
||||
|
||||
expect(post_numbers).to contain_exactly(1)
|
||||
end
|
||||
end
|
||||
|
||||
context "when the topic has a best replies summary" do
|
||||
before { topic.update(has_summary: true) }
|
||||
|
||||
it_behaves_like "includes only public-visible topics"
|
||||
end
|
||||
|
||||
context "when the topic doesn't have a best replies summary" do
|
||||
before { topic.update(has_summary: false) }
|
||||
|
||||
it_behaves_like "includes only public-visible topics"
|
||||
end
|
||||
|
||||
context "when the topic has embed content cached" do
|
||||
it "embed content is used instead of the raw text" do
|
||||
topic_embed =
|
||||
Fabricate(
|
||||
:topic_embed,
|
||||
topic: topic,
|
||||
embed_content_cache: "<p>hello world new post :D</p>",
|
||||
)
|
||||
|
||||
content = topic_summary.targets_data
|
||||
|
||||
op_content = content[:contents].first[:text]
|
||||
|
||||
expect(op_content).to include(topic_embed.embed_content_cache)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
|
@ -19,6 +19,7 @@ RSpec.describe DiscourseAi::Summarization::SummaryController do
|
|||
summarized_text: "test",
|
||||
algorithm: "test",
|
||||
original_content_sha: "test",
|
||||
summary_type: AiSummary.summary_types[:complete],
|
||||
)
|
||||
|
||||
sign_in(Fabricate(:admin))
|
||||
|
@ -47,6 +48,7 @@ RSpec.describe DiscourseAi::Summarization::SummaryController do
|
|||
summarized_text: "test",
|
||||
algorithm: "test",
|
||||
original_content_sha: "test",
|
||||
summary_type: AiSummary.summary_types[:complete],
|
||||
)
|
||||
|
||||
get "/discourse-ai/summarization/t/#{topic.id}.json"
|
||||
|
@ -133,6 +135,7 @@ RSpec.describe DiscourseAi::Summarization::SummaryController do
|
|||
summarized_text: "test",
|
||||
algorithm: "test",
|
||||
original_content_sha: "test",
|
||||
summary_type: AiSummary.summary_types[:complete],
|
||||
)
|
||||
|
||||
get "/discourse-ai/summarization/t/#{topic.id}.json"
|
||||
|
|
|
@ -11,55 +11,14 @@ describe DiscourseAi::TopicSummarization do
|
|||
SiteSetting.ai_summarization_enabled = true
|
||||
end
|
||||
|
||||
let(:strategy) { DiscourseAi::Summarization.default_strategy }
|
||||
|
||||
shared_examples "includes only public-visible topics" do
|
||||
subject { DiscourseAi::TopicSummarization.new(strategy, topic, user) }
|
||||
|
||||
it "only includes visible posts" do
|
||||
topic.first_post.update!(hidden: true)
|
||||
|
||||
posts = subject.summary_targets
|
||||
|
||||
expect(posts.none?(&:hidden?)).to eq(true)
|
||||
end
|
||||
|
||||
it "doesn't include posts without users" do
|
||||
topic.first_post.user.destroy!
|
||||
|
||||
posts = subject.summary_targets
|
||||
|
||||
expect(posts.detect { |p| p.id == topic.first_post.id }).to be_nil
|
||||
end
|
||||
|
||||
it "doesn't include deleted posts" do
|
||||
topic.first_post.update!(user_id: nil)
|
||||
|
||||
posts = subject.summary_targets
|
||||
|
||||
expect(posts.detect { |p| p.id == topic.first_post.id }).to be_nil
|
||||
end
|
||||
end
|
||||
|
||||
describe "#summary_targets" do
|
||||
context "when the topic has a best replies summary" do
|
||||
before { topic.has_summary = true }
|
||||
|
||||
it_behaves_like "includes only public-visible topics"
|
||||
end
|
||||
|
||||
context "when the topic doesn't have a best replies summary" do
|
||||
before { topic.has_summary = false }
|
||||
|
||||
it_behaves_like "includes only public-visible topics"
|
||||
end
|
||||
end
|
||||
let(:strategy) { DiscourseAi::Summarization.topic_summary(topic) }
|
||||
|
||||
describe "#summarize" do
|
||||
subject(:summarization) { described_class.new(strategy, topic, user) }
|
||||
subject(:summarization) { described_class.new(strategy, user) }
|
||||
|
||||
def assert_summary_is_cached(topic, summary_response)
|
||||
cached_summary = AiSummary.find_by(target: topic)
|
||||
cached_summary =
|
||||
AiSummary.find_by(target: topic, summary_type: AiSummary.summary_types[:complete])
|
||||
|
||||
expect(cached_summary.content_range).to cover(*topic.posts.map(&:post_number))
|
||||
expect(cached_summary.summarized_text).to eq(summary)
|
||||
|
@ -82,41 +41,15 @@ describe DiscourseAi::TopicSummarization do
|
|||
summarization.summarize
|
||||
|
||||
cached_summary_text = "This is a cached summary"
|
||||
AiSummary.find_by(target: topic).update!(
|
||||
AiSummary.find_by(target: topic, summary_type: AiSummary.summary_types[:complete]).update!(
|
||||
summarized_text: cached_summary_text,
|
||||
updated_at: 24.hours.ago,
|
||||
)
|
||||
|
||||
summarization = described_class.new(strategy, topic, user)
|
||||
summarization = described_class.new(strategy, user)
|
||||
section = summarization.summarize
|
||||
expect(section.summarized_text).to eq(cached_summary_text)
|
||||
end
|
||||
|
||||
context "when the topic has embed content cached" do
|
||||
it "embed content is used instead of the raw text" do
|
||||
topic_embed =
|
||||
Fabricate(
|
||||
:topic_embed,
|
||||
topic: topic,
|
||||
embed_content_cache: "<p>hello world new post :D</p>",
|
||||
)
|
||||
|
||||
DiscourseAi::Completions::Llm.with_prepared_responses(["A summary"]) do |spy|
|
||||
summarization.summarize
|
||||
|
||||
prompt_raw =
|
||||
spy
|
||||
.prompt_messages
|
||||
.reduce(+"") do |memo, m|
|
||||
memo << m[:content] << "\n"
|
||||
|
||||
memo
|
||||
end
|
||||
|
||||
expect(prompt_raw).to include(topic_embed.embed_content_cache)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe "invalidating cached summaries" do
|
||||
|
@ -124,7 +57,7 @@ describe DiscourseAi::TopicSummarization do
|
|||
let(:updated_summary) { "This is the final summary" }
|
||||
|
||||
def cached_summary
|
||||
AiSummary.find_by(target: topic)
|
||||
AiSummary.find_by(target: topic, summary_type: AiSummary.summary_types[:complete])
|
||||
end
|
||||
|
||||
before do
|
||||
|
@ -133,8 +66,8 @@ describe DiscourseAi::TopicSummarization do
|
|||
# since it is glued to the old llm instance
|
||||
# so we create the cached summary totally independently
|
||||
DiscourseAi::Completions::Llm.with_prepared_responses([cached_text]) do
|
||||
strategy = DiscourseAi::Summarization.default_strategy
|
||||
described_class.new(strategy, topic, user).summarize
|
||||
strategy = DiscourseAi::Summarization.topic_summary(topic)
|
||||
described_class.new(strategy, user).summarize
|
||||
end
|
||||
|
||||
cached_summary.update!(summarized_text: cached_text, created_at: 24.hours.ago)
|
||||
|
|
|
@ -32,6 +32,7 @@ RSpec.describe "Summarize a topic ", type: :system do
|
|||
summarized_text: summarization_result,
|
||||
algorithm: "test",
|
||||
original_content_sha: "test",
|
||||
summary_type: AiSummary.summary_types[:complete],
|
||||
)
|
||||
end
|
||||
|
||||
|
|