FEATURE: Generate topic gists for the hot topics list. (#837)

* Display gists in the hot topics list

* Adjust hot topics gist strategy and add a job to generate gists

* Replace setting with a configurable batch size

* Avoid loading summaries for other topic lists

* Tweak gist prompt to focus on latest posts in the context of the OP

* Remove serializer hack and rely on core change from discourse/discourse#29291

* Update lib/summarization/strategies/hot_topic_gists.rb

Co-authored-by: Rafael dos Santos Silva <xfalcox@gmail.com>

---------

Co-authored-by: Rafael dos Santos Silva <xfalcox@gmail.com>
This commit is contained in:
Roman Rizzi 2024-10-18 18:01:39 -03:00 committed by GitHub
parent decf1bb49d
commit 27b5542357
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
16 changed files with 455 additions and 107 deletions

View File

@ -1,3 +1,4 @@
< 3.4.0.beta3-dev: ecf1bb49d737ea15308400f22f89d1d1e71d13d
< 3.4.0.beta1-dev: 9d887ad4ace8e33c3fe7dbb39237e882c08b4f0b
< 3.3.0.beta5-dev: 4d8090002f6dcd8e34d41033606bf131fa221475
< 3.3.0.beta2-dev: 61890b667c06299841ae88946f84a112f00060e1

View File

@ -0,0 +1,24 @@
# frozen_string_literal: true
module ::Jobs
  # Generates (or refreshes) AI gists for the highest-scoring hot topics.
  #
  # The job is a no-op when the plugin or the summarization module is disabled,
  # or when `ai_summarize_max_hot_topics_gists_per_batch` is 0.
  class HotTopicsGistBatch < ::Jobs::Base
    # @param args [Hash] job arguments; not read by this job.
    def execute(args)
      return if !SiteSetting.discourse_ai_enabled
      return if !SiteSetting.ai_summarization_enabled

      batch_size = SiteSetting.ai_summarize_max_hot_topics_gists_per_batch
      return if batch_size.zero?

      # Only topics that have a row in topic_hot_scores are considered,
      # highest score first, capped at the configured batch size.
      Topic
        .joins("JOIN topic_hot_scores on topics.id = topic_hot_scores.topic_id")
        .order("topic_hot_scores.score DESC")
        .limit(batch_size)
        .each do |topic|
          summarizer = DiscourseAi::Summarization.topic_gist(topic)

          # NOTE(review): the gist factory appears to return nil when no
          # summarization model is configured; skip the topic instead of
          # raising NoMethodError on nil.
          next if summarizer.nil?

          gist = summarizer.existing_summary
          # Drop a stale cached gist before asking for a new one.
          summarizer.delete_cached_summaries! if gist&.outdated

          summarizer.summarize(Discourse.system_user)
        end
    end
  end
end

View File

@ -0,0 +1,15 @@
import Component from "@glimmer/component";
export default class AiTopicGist extends Component {
static shouldRender(outletArgs) {
return outletArgs?.topic?.ai_topic_gist && !outletArgs.topic.excerpt;
}
<template>
<div class="ai-topic-gist">
<div class="ai-topic-gist__text">
{{@outletArgs.topic.ai_topic_gist}}
</div>
</div>
</template>
}

View File

@ -215,3 +215,11 @@
opacity: 1;
}
}
// Wrapper emitted by the AiTopicGist component.
.ai-topic-gist {
margin-top: 0.5em;
// Gist text is rendered two steps smaller than the base font size.
&__text {
font-size: var(--font-down-2);
}
}

View File

@ -84,6 +84,7 @@ en:
ai_summarization_model: "Model to use for summarization."
ai_custom_summarization_allowed_groups: "Groups allowed to create new summaries."
ai_pm_summarization_allowed_groups: "Groups allowed to create and view summaries in PMs."
ai_summarize_max_hot_topics_gists_per_batch: "After updating topics in the hot list, we'll generate brief summaries of the first N ones. (Disabled when 0)"
ai_bot_enabled: "Enable the AI Bot module."
ai_bot_enable_chat_warning: "Display a warning when PM chat is initiated. Can be overridden by editing the translation string: discourse_ai.ai_bot.pm_warning"

View File

@ -376,6 +376,10 @@ discourse_ai:
type: group_list
list_type: compact
default: "3|13" # 3: @staff, 13: @trust_level_3
# Upper bound on how many hot topics get a gist per batch run.
# The default of 0 turns hot topic gists off.
ai_summarize_max_hot_topics_gists_per_batch:
default: 0
min: 0
max: 1000
ai_summarization_strategy: # TODO(roman): Deprecated. Remove by Sept 2024
type: enum
default: ""

View File

@ -17,7 +17,7 @@ module DiscourseAi
if SiteSetting.ai_summarization_model.present? && SiteSetting.ai_summarization_enabled
DiscourseAi::Summarization::FoldContent.new(
DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_summarization_model),
DiscourseAi::Summarization::Strategies::TopicGist.new(topic),
DiscourseAi::Summarization::Strategies::HotTopicGists.new(topic),
)
else
nil

View File

@ -16,6 +16,38 @@ module DiscourseAi
plugin.add_to_serializer(:web_hook_topic_view, :summarizable) do
scope.can_see_summary?(object.topic, AiSummary.summary_types[:complete])
end
# Loads AI summaries together with the topics of the hot list so the
# serializer can read gists without extra queries. Any other list, or a
# disabled feature, gets the relation back untouched.
#
# NOTE(review): if `includes` resolves to a join here, a topic whose only
# summaries are non-gist ones would fail both branches of the WHERE clause
# and disappear from the hot list — confirm.
plugin.register_modifier(:topic_query_create_list_topics) do |topics, options|
  gists_requested =
    options[:filter] == :hot && SiteSetting.ai_summarization_enabled &&
      SiteSetting.ai_summarize_max_hot_topics_gists_per_batch > 0

  next topics unless gists_requested

  gist_type = AiSummary.summary_types[:gist]

  topics.includes(:ai_summaries).where(
    "ai_summaries.id IS NULL OR ai_summaries.summary_type = ?",
    gist_type,
  )
end
# Adds `ai_topic_gist` to topic list items. The attribute is serialized only
# when summarization is on, the batch size is above zero, and the serializer
# was built for the hot filter.
plugin.add_to_serializer(
  :topic_list_item,
  :ai_topic_gist,
  include_condition: -> {
    SiteSetting.ai_summarization_enabled &&
      SiteSetting.ai_summarize_max_hot_topics_gists_per_batch > 0 &&
      options[:filter] == :hot
  },
) do
  # At most one summary is expected on the topic at this point; picking by
  # type guards against exposing a regular (non-gist) summary.
  gist = object.ai_summaries.to_a.find { |summary| summary.summary_type == "gist" }
  gist&.summarized_text
end
# To make sure hot topic gists are immediately up to date, we rely on this event
# instead of using a scheduled job.
plugin.on(:topic_hot_scores_updated) { Jobs.enqueue(:hot_topics_gist_batch) }
end
end
end

View File

@ -0,0 +1,121 @@
# frozen_string_literal: true
module DiscourseAi
  module Summarization
    module Strategies
      # Strategy that condenses a hot topic into a single-sentence "gist".
      #
      # The content handed to the model is the original post plus the posts
      # created within the `hot_topics_recent_days` window (or the latest 20
      # posts when nothing recent exists). The prompt asks the model to focus
      # on the latest posts in the context of the original post.
      class HotTopicGists < Base
        # Zero-width boundary placed right before every
        # "<post number>) <username> said:" marker.
        # - The lookbehind stops the pattern from also matching in the middle
        #   of a multi-digit post number ("12) bob said:" must not split
        #   into "1" and "2) bob said:").
        # - The username part accepts dots and dashes in addition to word
        #   characters.
        STATEMENT_BOUNDARY = /(?<!\d)(?=\d+\) [[:word:].-]+ said:)/

        # @return the identifier of the gist summary type.
        def type
          AiSummary.summary_types[:gist]
        end

        # Collects the posts that will be summarized.
        #
        # @return [Hash] `{ content_title:, contents: }` where `contents` is an
        #   array of `{ poster:, id:, text: }` hashes ordered by post number.
        def targets_data
          op_post_number = 1
          hot_topics_recent_cutoff = Time.zone.now - SiteSetting.hot_topics_recent_days.days

          # Regular, non-hidden posts of the topic; both lookups below start here.
          visible_regular_posts =
            Post
              .where(topic_id: target.id)
              .where("post_type = ?", Post.types[:regular])
              .where("NOT hidden")

          recent_hot_posts =
            visible_regular_posts.where("created_at >= ?", hot_topics_recent_cutoff).pluck(
              :post_number,
            )

          # It may happen that a topic is hot without any recent posts.
          # In that case, we'll just grab the last 20 posts
          # for a useful summary of the current state of the topic.
          if recent_hot_posts.empty?
            recent_hot_posts =
              visible_regular_posts.order("post_number DESC").limit(20).pluck(:post_number)
          end

          # The OP is always part of the context; avoid listing it twice.
          wanted_post_numbers = (recent_hot_posts + [op_post_number]).uniq

          contents =
            Post
              .where(topic_id: target.id)
              .joins(:user)
              .where("post_number IN (?)", wanted_post_numbers)
              .order(:post_number)
              .pluck(:post_number, :raw, :username)
              .map do |post_number, raw, username|
                text = raw
                if post_number == op_post_number
                  # Prefer the cached embed content over the raw OP when present.
                  text = target.topic_embed&.embed_content_cache.presence || raw
                end

                { poster: username, id: post_number, text: text }
              end

          { content_title: target.title, contents: contents }
        end

        # Builds the prompt that merges several partial summaries into one sentence.
        #
        # @param texts_to_summarize [Array<String>] partial summaries.
        # @return [DiscourseAi::Completions::Prompt]
        def concatenation_prompt(texts_to_summarize)
          prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
            You are a summarization bot tasked with creating a single, concise sentence by merging disjointed summaries into a cohesive statement.
            Your response should strictly be this single, comprehensive sentence, without any additional text or comments.
            - Focus on the central theme or issue being addressed, maintaining an objective and neutral tone.
            - Exclude extraneous details or subjective opinions.
            - Use the original language of the text.
            - Begin directly with the main topic or issue, avoiding introductory phrases.
            - Limit the summary to a maximum of 20 words.
          TEXT

          prompt.push(type: :user, content: <<~TEXT.strip)
            THESE are the summaries, each one separated by a newline, all of them inside <input></input> XML tags:
            <input>
            #{texts_to_summarize.join("\n")}
            </input>
          TEXT

          prompt
        end

        # Builds the prompt used to summarize one chunk of posts.
        #
        # The input is split into individual statements. The first one is
        # presented as the statement the conversation began with, and the
        # remaining ones as the subsequent discussion.
        #
        # @param input [String] posts rendered as "<number>) <username> said: <text>".
        # @param opts [Hash] reads `:content_title`.
        # @return [DiscourseAi::Completions::Prompt]
        def summarize_single_prompt(input, opts)
          statements = input.split(STATEMENT_BOUNDARY)

          # Posts arrive ordered by post number, so the opening statement is
          # the first element (not the last one).
          opening_statement = statements.shift
          subsequent_statements = statements.join

          prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
            You are an advanced summarization bot. Analyze a given conversation and produce a concise,
            single-sentence summary that conveys the main topic and current developments to someone with no prior context.
            ### Guidelines:
            - Emphasize the most recent updates while considering their significance within the original post.
            - Focus on the central theme or issue being addressed, maintaining an objective and neutral tone.
            - Exclude extraneous details or subjective opinions.
            - Use the original language of the text.
            - Begin directly with the main topic or issue, avoiding introductory phrases.
            - Limit the summary to a maximum of 20 words.
          TEXT

          prompt.push(type: :user, content: <<~TEXT.strip)
            ### Context:
            The conversation began with the following statement:
            #{opts[:content_title].present? ? "The discussion title is: " + opts[:content_title] + ".\n" : ""}
            #{opening_statement}
            Subsequent discussion includes the following:
            #{subsequent_statements}
            Your task is to focus on these latest messages, capturing their meaning in the context of the initial post.
          TEXT

          prompt
        end
      end
    end
  end
end

View File

@ -1,90 +0,0 @@
# frozen_string_literal: true
module DiscourseAi
  module Summarization
    module Strategies
      # Strategy that produces a one-sentence gist out of the original post
      # and the latest 20 regular, visible posts of a topic.
      class TopicGist < Base
        # @return the identifier of the gist summary type.
        def type
          AiSummary.summary_types[:gist]
        end

        # Gathers the posts to summarize.
        #
        # @return [Hash] `{ content_title:, contents: }`; `contents` holds
        #   `{ poster:, id:, text: }` hashes ordered by post number.
        def targets_data
          newest_post_numbers =
            Post
              .where(topic_id: target.id)
              .where("post_type = ?", Post.types[:regular])
              .where("NOT hidden")
              .order("post_number DESC")
              .limit(20)
              .pluck(:post_number)

          # Post number 1 (the OP) always goes along with the newest posts.
          rows =
            Post
              .where(topic_id: target.id)
              .joins(:user)
              .where("post_number IN (?)", newest_post_numbers.push(1))
              .order(:post_number)
              .pluck(:post_number, :raw, :username)

          entries =
            rows.map do |number, body, author|
              if number == 1
                # Cached embed content, when present, replaces the raw OP text.
                cached_embed = target.topic_embed&.embed_content_cache
                body = cached_embed if cached_embed.present?
              end

              { poster: author, id: number, text: body }
            end

          { content_title: target.title, contents: entries }
        end

        # Prompt that fuses several partial summaries into a single sentence.
        #
        # @param texts_to_summarize [Array<String>] partial summaries.
        # @return [DiscourseAi::Completions::Prompt]
        def concatenation_prompt(texts_to_summarize)
          system_message = <<~TEXT.strip
            You are a summarization bot tasked with creating a single, concise sentence by merging disjointed summaries into a cohesive statement.
            Your response should strictly be this single, comprehensive sentence, without any additional text or comments.
          TEXT

          user_message = <<~TEXT.strip
            THESE are the summaries, each one separated by a newline, all of them inside <input></input> XML tags:
            <input>
            #{texts_to_summarize.join("\n")}
            </input>
          TEXT

          prompt = DiscourseAi::Completions::Prompt.new(system_message)
          prompt.push(type: :user, content: user_message)
          prompt
        end

        # Prompt that summarizes one chunk of posts into a single sentence.
        #
        # @param input [String] the rendered posts.
        # @param opts [Hash] reads `:content_title`.
        # @return [DiscourseAi::Completions::Prompt]
        def summarize_single_prompt(input, opts)
          title_line =
            if opts[:content_title].present?
              "The discussion title is: " + opts[:content_title] + ".\n"
            else
              ""
            end

          system_message = <<~TEXT.strip
            You are an advanced summarization bot. Your task is to analyze a given conversation and generate a single,
            concise sentence that clearly conveys the main topic and purpose of the discussion to someone with no prior context.
            - Focus on the central theme or issue being addressed, while maintaining an objective and neutral tone.
            - Avoid including extraneous details or subjective opinions.
            - Maintain the original language of the text being summarized.
          TEXT

          user_message = <<~TEXT.strip
            #{title_line}
            Here are the posts, inside <input></input> XML tags:
            <input>
            #{input}
            </input>
            Generate a single sentence of the text above maintaining the original language.
          TEXT

          prompt = DiscourseAi::Completions::Prompt.new(system_message)
          prompt.push(type: :user, content: user_message)
          prompt
        end
      end
    end
  end
end

9
lib/topic_extensions.rb Normal file
View File

@ -0,0 +1,9 @@
# frozen_string_literal: true
module DiscourseAi
  # Concern meant to be prepended to a model; it declares the polymorphic
  # `ai_summaries` association (summaries whose `target` is the record).
  module TopicExtensions
    extend ActiveSupport::Concern

    prepended do
      has_many :ai_summaries, as: :target
    end
  end
end

View File

@ -76,7 +76,10 @@ after_initialize do
require_relative "spec/support/stable_diffusion_stubs"
end
reloadable_patch { |plugin| Guardian.prepend DiscourseAi::GuardianExtensions }
reloadable_patch do |plugin|
Guardian.prepend DiscourseAi::GuardianExtensions
Topic.prepend DiscourseAi::TopicExtensions
end
register_modifier(:post_should_secure_uploads?) do |_, _, topic|
if topic.private_message? && SharedAiConversation.exists?(target: topic)

View File

@ -0,0 +1,14 @@
# frozen_string_literal: true
# Base fabricator: a summary of type "complete" that targets a freshly fabricated topic.
Fabricator(:ai_summary) do
summarized_text "complete summary"
original_content_sha "123"
algorithm "test"
target { Fabricate(:topic) }
summary_type AiSummary.summary_types[:complete]
end
# Same as :ai_summary, but with gist text and the gist summary type.
Fabricator(:topic_ai_gist, from: :ai_summary) do
summarized_text "gist"
summary_type AiSummary.summary_types[:gist]
end

View File

@ -0,0 +1,121 @@
# frozen_string_literal: true
# Specs for the job that generates gists for topics with a hot score.
RSpec.describe Jobs::HotTopicsGistBatch do
fab!(:topic_1) { Fabricate(:topic) }
fab!(:post_1) { Fabricate(:post, topic: topic_1, post_number: 1) }
fab!(:post_2) { Fabricate(:post, topic: topic_1, post_number: 2) }
# Enable summarization with a fake model and a non-zero batch size so the job runs by default.
before do
assign_fake_provider_to(:ai_summarization_model)
SiteSetting.ai_summarization_enabled = true
SiteSetting.ai_summarize_max_hot_topics_gists_per_batch = 100
end
describe "#execute" do
context "when there is a topic with a hot score" do
before { TopicHotScore.create!(topic_id: topic_1.id, score: 0.1) }
# Each of the following three examples flips one of the job's guard settings.
it "does nothing if the plugin is disabled" do
SiteSetting.discourse_ai_enabled = false
subject.execute({})
gist = AiSummary.gist.find_by(target: topic_1)
expect(gist).to be_nil
end
it "does nothing if the summarization module is disabled" do
SiteSetting.ai_summarization_enabled = false
subject.execute({})
gist = AiSummary.gist.find_by(target: topic_1)
expect(gist).to be_nil
end
it "does nothing if hot topics summarization is disabled" do
SiteSetting.ai_summarize_max_hot_topics_gists_per_batch = 0
subject.execute({})
gist = AiSummary.gist.find_by(target: topic_1)
expect(gist).to be_nil
end
it "creates a gist" do
gist_result = "I'm a gist"
DiscourseAi::Completions::Llm.with_prepared_responses([gist_result]) { subject.execute({}) }
gist = AiSummary.gist.find_by(target: topic_1)
expect(gist.summarized_text).to eq(gist_result)
end
context "when we already generated a gist of it" do
# NOTE(review): "12" presumably stands for the concatenated post numbers (1 and 2)
# the existing gist was built from — confirm against AiSummary.build_sha callers.
fab!(:ai_gist) do
Fabricate(
:topic_ai_gist,
target: topic_1,
original_content_sha: AiSummary.build_sha("12"),
)
end
# No prepared LLM response is set up here; the stored gist is expected to stay as is.
it "does nothing if the gist is up to date" do
subject.execute({})
gist = AiSummary.gist.find_by(target: topic_1)
expect(gist.summarized_text).to eq(ai_gist.summarized_text)
expect(gist.original_content_sha).to eq(ai_gist.original_content_sha)
end
it "regenerates it if it's outdated" do
# A third post is added after the stored gist was created.
Fabricate(:post, topic: topic_1, post_number: 3)
gist_result = "They updated me"
DiscourseAi::Completions::Llm.with_prepared_responses([gist_result]) do
subject.execute({})
end
gist = AiSummary.gist.find_by(target: topic_1)
expect(gist.summarized_text).to eq(gist_result)
expect(gist.original_content_sha).to eq(AiSummary.build_sha("123"))
end
end
end
context "when there is a topic but it doesn't have a hot score" do
it "does nothing" do
subject.execute({})
gist = AiSummary.gist.find_by(target: topic_1)
expect(gist).to be_nil
end
end
context "when there are multiple hot topics" do
fab!(:topic_2) { Fabricate(:topic) }
fab!(:post_2_1) { Fabricate(:post, topic: topic_2, post_number: 1) }
fab!(:post_2_2) { Fabricate(:post, topic: topic_2, post_number: 2) }
# topic_2 gets the higher score.
before do
TopicHotScore.create!(topic_id: topic_1.id, score: 0.2)
TopicHotScore.create!(topic_id: topic_2.id, score: 0.4)
end
it "processes them by score order" do
topic_1_gist = "I'm gist of topic 1"
topic_2_gist = "I'm gist of topic 2"
# Responses are listed with topic_2's gist first, matching the descending score order.
DiscourseAi::Completions::Llm.with_prepared_responses([topic_2_gist, topic_1_gist]) do
subject.execute({})
end
gist = AiSummary.gist.find_by(target: topic_1)
expect(gist.summarized_text).to eq(topic_1_gist)
gist_2 = AiSummary.gist.find_by(target: topic_2)
expect(gist_2.summarized_text).to eq(topic_2_gist)
end
end
end
end

View File

@ -0,0 +1,94 @@
# frozen_string_literal: true
# Specs for the hot topic gist hooks: the topic list modifier, the
# `ai_topic_gist` serializer attribute and the hot-scores-updated event.
RSpec.describe DiscourseAi::Summarization::EntryPoint do
before do
assign_fake_provider_to(:ai_summarization_model)
SiteSetting.ai_summarization_enabled = true
end
fab!(:user)
describe "#inject_into" do
describe "hot topics gist summarization" do
# The same topic owns both a gist and a regular ("complete") summary.
fab!(:topic_ai_gist)
fab!(:regular_summary) { Fabricate(:ai_summary, target: topic_ai_gist.target) }
before { TopicHotScore.create!(topic_id: topic_ai_gist.target_id, score: 1.0) }
let(:topic_query) { TopicQuery.new(user) }
describe "topic_query_create_list_topics modifier" do
context "when hot topic summarization is enabled" do
before { SiteSetting.ai_summarize_max_hot_topics_gists_per_batch = 100 }
it "preloads only gist summaries" do
gist_topic = topic_query.list_hot.topics.find { |t| t.id == topic_ai_gist.target_id }
expect(gist_topic.ai_summaries.size).to eq(1)
expect(gist_topic.ai_summaries.first).to eq(topic_ai_gist)
end
it "doesn't filter out hot topics without summaries" do
# Second hot topic with no summaries at all.
TopicHotScore.create!(topic_id: Fabricate(:topic).id, score: 1.0)
expect(topic_query.list_hot.topics.size).to eq(2)
end
end
end
describe "topic_list_item serializer's ai_summary" do
context "when hot topic summarization is disabled" do
# The batch size setting is not changed in this context.
it "doesn't include summaries" do
gist_topic = topic_query.list_hot.topics.find { |t| t.id == topic_ai_gist.target_id }
serialized =
TopicListItemSerializer.new(gist_topic, scope: Guardian.new, root: false).as_json
expect(serialized.has_key?(:ai_topic_gist)).to eq(false)
end
end
context "when hot topics summarization is enabled" do
before { SiteSetting.ai_summarize_max_hot_topics_gists_per_batch = 100 }
it "includes the summary" do
gist_topic = topic_query.list_hot.topics.find { |t| t.id == topic_ai_gist.target_id }
serialized =
TopicListItemSerializer.new(
gist_topic,
scope: Guardian.new,
root: false,
filter: :hot,
).as_json
expect(serialized[:ai_topic_gist]).to be_present
end
it "doesn't include the summary when looking at other topic lists" do
gist_topic = topic_query.list_hot.topics.find { |t| t.id == topic_ai_gist.target_id }
# Same topic, but the serializer is built with the :latest filter.
serialized =
TopicListItemSerializer.new(
gist_topic,
scope: Guardian.new,
root: false,
filter: :latest,
).as_json
expect(serialized[:ai_topic_gist]).to be_nil
end
end
end
end
end
describe "#on topic_hot_scores_updated" do
it "queues a job to generate gists" do
expect { DiscourseEvent.trigger(:topic_hot_scores_updated) }.to change(
Jobs::HotTopicsGistBatch.jobs,
:size,
).by(1)
end
end
end

View File

@ -1,6 +1,6 @@
# frozen_string_literal: true
RSpec.describe DiscourseAi::Summarization::Strategies::TopicGist do
RSpec.describe DiscourseAi::Summarization::Strategies::HotTopicGists do
subject(:gist) { described_class.new(topic) }
fab!(:topic) { Fabricate(:topic, highest_post_number: 25) }
@ -8,22 +8,13 @@ RSpec.describe DiscourseAi::Summarization::Strategies::TopicGist do
fab!(:post_2) { Fabricate(:post, topic: topic, post_number: 2) }
describe "#targets_data" do
context "when the topic has more than 20 posts" do
before do
offset = 3 # Already created posts 1 and 2
(topic.highest_post_number - 2).times do |i|
Fabricate(:post, topic: topic, post_number: i + offset)
end
end
it "respects the `hot_topics_recent_days` setting" do
post_2.update(created_at: (SiteSetting.hot_topics_recent_days + 1).days.ago)
Fabricate(:post, topic: topic, post_number: 3)
it "includes the OP and the last 20 posts" do
content = gist.targets_data
post_numbers = content[:contents].map { |c| c[:id] }
post_numbers = gist.targets_data[:contents].map { |c| c[:id] }
expected = (6..25).to_a << 1
expect(post_numbers).to contain_exactly(*expected)
end
expect(post_numbers).to contain_exactly(1, 3)
end
it "only includes visible posts" do