FEATURE: Use personas for generating hypothetical posts (#1482)

* FEATURE: Use personas for generating hypothetical posts

* Update prompt
This commit is contained in:
Roman Rizzi 2025-07-02 10:56:38 -03:00 committed by GitHub
parent 40fa527633
commit 75fb37144f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 158 additions and 30 deletions

View File

@ -222,6 +222,10 @@ en:
name: "Search"
description: "Enhances search experience by providing AI-generated answers to queries"
discoveries: "Discoveries"
embeddings:
name: "Embeddings"
description: "Powers features like Related Topics and AI Search by generating semantic representations of text"
hyde: "HyDE"
discord:
name: "Discord integration"
description: "Adds the ability to search Discord channels"

View File

@ -394,6 +394,9 @@ en:
spam_detector:
name: "Spam detector"
description: "Default persona powering our Spam detection feature"
content_creator:
name: "Content creator"
description: "Default persona powering HyDE search"
topic_not_found: "Summary unavailable, topic not found!"
summarizing: "Summarizing topic"

View File

@ -222,21 +222,30 @@ discourse_ai:
default: false
client: true
validator: "DiscourseAi::Configuration::EmbeddingsModuleValidator"
area: "ai-features/embeddings"
ai_embeddings_selected_model:
type: enum
default: ""
allow_any: false
enum: "DiscourseAi::Configuration::EmbeddingDefsEnumerator"
validator: "DiscourseAi::Configuration::EmbeddingDefsValidator"
area: "ai-features/embeddings"
ai_embeddings_per_post_enabled:
default: false
hidden: true
ai_embeddings_generate_for_pms: false
ai_embeddings_generate_for_pms:
default: false
area: "ai-features/embeddings"
ai_embeddings_semantic_related_topics_enabled:
default: false
client: true
ai_embeddings_semantic_related_topics: 5
ai_embeddings_semantic_related_include_closed_topics: true
area: "ai-features/embeddings"
ai_embeddings_semantic_related_topics:
default: 5
area: "ai-features/embeddings"
ai_embeddings_semantic_related_include_closed_topics:
default: true
area: "ai-features/embeddings"
ai_embeddings_backfill_batch_size:
default: 250
hidden: true
@ -244,12 +253,14 @@ discourse_ai:
default: false
client: true
validator: "DiscourseAi::Configuration::LlmDependencyValidator"
area: "ai-features/embeddings"
ai_embeddings_semantic_search_hyde_model:
default: ""
type: enum
allow_any: false
enum: "DiscourseAi::Configuration::LlmEnumerator"
validator: "DiscourseAi::Configuration::LlmValidator"
area: "ai-features/embeddings"
ai_embeddings_semantic_search_hyde_model_allowed_seeded_models:
default: ""
hidden: true
@ -259,6 +270,12 @@ discourse_ai:
default: false
client: true
hidden: true
area: "ai-features/embeddings"
ai_embeddings_semantic_search_hyde_persona:
default: "-32"
type: enum
enum: "DiscourseAi::Configuration::PersonaEnumerator"
area: "ai-features/embeddings"
ai_embeddings_discourse_service_api_endpoint:
default: ""

View File

@ -36,6 +36,8 @@ DiscourseAi::Personas::Persona.system_personas.each do |persona_class, id|
setting_name = "ai_helper_custom_prompts_allowed_groups"
default_groups = [Group::AUTO_GROUPS[:staff]]
persona.allowed_group_ids = from_setting(setting_name) || default_groups
elsif persona_class == DiscourseAi::Personas::ContentCreator
persona.allowed_group_ids = [Group::AUTO_GROUPS[:everyone]]
else
persona.allowed_group_ids = [Group::AUTO_GROUPS[:trust_level_0]]
end

View File

@ -144,6 +144,17 @@ module DiscourseAi
]
end
# Feature registry entries for the embeddings module.
#
# Currently exposes a single feature — "hyde" — wired to the
# `ai_embeddings_semantic_search_hyde_persona` site setting. The list is
# memoized in feature_cache so it is only built once per process.
#
# @return [Array] the cached embeddings feature descriptors
def embeddings_features
  feature_cache[:embeddings] ||= begin
    hyde_feature =
      new(
        "hyde",
        "ai_embeddings_semantic_search_hyde_persona",
        DiscourseAi::Configuration::Module::EMBEDDINGS_ID,
        DiscourseAi::Configuration::Module::EMBEDDINGS,
      )
    [hyde_feature]
  end
end
def lookup_bot_persona_ids
AiPersona
.where(enabled: true)
@ -196,6 +207,7 @@ module DiscourseAi
translation_features,
bot_features,
spam_features,
embeddings_features,
].flatten
end
@ -241,6 +253,8 @@ module DiscourseAi
DiscourseAi::AiHelper::Assistant.find_ai_helper_model(name, persona_klass)
when DiscourseAi::Configuration::Module::TRANSLATION
DiscourseAi::Translation::BaseTranslator.preferred_llm_model(persona_klass)
when DiscourseAi::Configuration::Module::EMBEDDINGS
DiscourseAi::Embeddings::SemanticSearch.new(nil).find_ai_hyde_model(persona_klass)
end
if llm_model.blank? && persona.default_llm_id

View File

@ -11,8 +11,19 @@ module DiscourseAi
TRANSLATION = "translation"
BOT = "bot"
SPAM = "spam"
EMBEDDINGS = "embeddings"
NAMES = [SUMMARIZATION, SEARCH, DISCORD, INFERENCE, AI_HELPER, TRANSLATION, BOT, SPAM].freeze
NAMES = [
SUMMARIZATION,
SEARCH,
DISCORD,
INFERENCE,
AI_HELPER,
TRANSLATION,
BOT,
SPAM,
EMBEDDINGS,
].freeze
SUMMARIZATION_ID = 1
SEARCH_ID = 2
@ -22,6 +33,7 @@ module DiscourseAi
TRANSLATION_ID = 6
BOT_ID = 7
SPAM_ID = 8
EMBEDDINGS_ID = 9
class << self
def all
@ -75,6 +87,13 @@ module DiscourseAi
enabled_by_setting: "ai_spam_detection_enabled",
features: DiscourseAi::Configuration::Feature.spam_features,
),
new(
EMBEDDINGS_ID,
EMBEDDINGS,
enabled_by_setting: "ai_embeddings_enabled",
features: DiscourseAi::Configuration::Feature.embeddings_features,
extra_check: -> { SiteSetting.ai_embeddings_semantic_search_enabled },
),
]
end

View File

@ -78,7 +78,9 @@ module DiscourseAi
return Post.none
end
search_embedding = hyde ? hyde_embedding(search_term) : embedding(search_term)
search_embedding = nil
search_embedding = hyde_embedding(search_term) if hyde
search_embedding = embedding(search_term) if search_embedding.blank?
over_selection_limit = limit * OVER_SELECTION_FACTOR
@ -176,26 +178,47 @@ module DiscourseAi
end
def hypothetical_post_from(search_term)
prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
You are a content creator for a forum. The forum description is as follows:
#{SiteSetting.title}
#{SiteSetting.site_description}
context =
DiscourseAi::Personas::BotContext.new(
user: @guardian.user,
skip_tool_details: true,
feature_name: "semantic_search_hyde",
messages: [{ type: :user, content: search_term }],
)
Put the forum post between <ai></ai> tags.
TEXT
bot = build_bot(@guardian.user)
return nil if bot.nil?
prompt.push(type: :user, content: <<~TEXT.strip)
Using this description, write a forum post about the subject inside the <input></input> XML tags:
structured_output = nil
raw_response = +""
hyde_schema_key = bot.persona.response_format&.first.to_h
<input>#{search_term}</input>
TEXT
buffer_blk =
Proc.new do |partial, _, type|
if type == :structured_output
structured_output = partial
elsif type.blank?
# Assume response is a regular completion.
raw_response << partial
end
end
llm_response =
DiscourseAi::Completions::Llm.proxy(
SiteSetting.ai_embeddings_semantic_search_hyde_model,
).generate(prompt, user: @guardian.user, feature_name: "semantic_search_hyde")
bot.reply(context, &buffer_blk)
Nokogiri::HTML5.fragment(llm_response).at("ai")&.text.presence || llm_response
structured_output&.read_buffered_property(hyde_schema_key["key"]&.to_sym) || raw_response
end
# Resolves which LLM model should generate the hypothetical (HyDE) post.
#
# Priority order:
#   1. The persona's own default LLM, when one is configured.
#   2. The model id embedded in the `ai_embeddings_semantic_search_hyde_model`
#      site setting (stored as "provider:id"; only the trailing id is used).
#
# @param persona_klass [Class] persona class exposing #default_llm_id
# @return [LlmModel, nil] the resolved model record, or nil when none is configured
def find_ai_hyde_model(persona_klass)
  setting_model_id = SiteSetting.ai_embeddings_semantic_search_hyde_model&.split(":")&.last
  chosen_id = persona_klass.default_llm_id || setting_model_id

  LlmModel.find_by(id: chosen_id) if chosen_id.present?
end
private
@ -209,6 +232,18 @@ module DiscourseAi
def build_embedding_key(digest, hyde_model, embedding_model)
"#{build_hyde_key(digest, hyde_model)}-#{embedding_model}"
end
# Builds the bot used to generate the hypothetical HyDE post.
#
# Looks up the persona selected by the
# `ai_embeddings_semantic_search_hyde_persona` site setting, then resolves an
# LLM for it through #find_ai_hyde_model. Returns nil when either the persona
# or the model cannot be resolved, so callers can bail out gracefully.
#
# @param user [User] the user the bot acts on behalf of
# @return [DiscourseAi::Personas::Bot, nil]
def build_bot(user)
  configured_persona = AiPersona.find_by(id: SiteSetting.ai_embeddings_semantic_search_hyde_persona)
  klass = configured_persona&.class_instance
  return nil if klass.nil?

  model = find_ai_hyde_model(klass)
  return nil if model.nil?

  DiscourseAi::Personas::Bot.as(user, persona: klass.new, model: model)
end
end
end
end

View File

@ -171,7 +171,7 @@ module DiscourseAi
text = +""
result.each { |item| text << item if item.is_a?(String) }
end
raw_context << [text, bot_user.username]
raw_context << [text, bot_user&.username]
end
total_completions += 1

View File

@ -0,0 +1,33 @@
# frozen_string_literal: true

module DiscourseAi
  module Personas
    # Persona that writes a hypothetical forum post from a handful of keywords.
    # Used by HyDE (Hypothetical Document Embeddings) semantic search: the
    # generated post's embedding stands in for the raw query's embedding.
    class ContentCreator < Persona
      # Not enabled by default; the HyDE feature selects it via the
      # `ai_embeddings_semantic_search_hyde_persona` site setting.
      def self.default_enabled
        false
      end

      # System prompt fed to the LLM. `{site_title}` / `{site_description}`
      # are placeholder tokens substituted by the persona framework.
      # NOTE: fixed typo — "Ttitle" -> "Title".
      def system_prompt
        <<~PROMPT.strip
          You are a content creator for a forum. The forum title and description is as follows:
          * Title: {site_title}
          * Description: {site_description}
          You will receive a couple of keywords and must create a post about the keywords, keeping the previous information in mind.
          Format your response as a JSON object with a single key named "output", which has the created content.
          Your output should be in the following format:
          <output>
          {"output": "xx"}
          </output>
          Where "xx" is replaced by the content.
        PROMPT
      end

      # Structured-output schema: a single string property named "output",
      # read back by SemanticSearch via read_buffered_property(:output).
      def response_format
        [{ "key" => "output", "type" => "string" }]
      end
    end
  end
end

View File

@ -69,6 +69,7 @@ module DiscourseAi
TopicTitleTranslator => -29,
ShortTextTranslator => -30,
SpamDetector => -31,
ContentCreator => -32,
}
end

View File

@ -27,7 +27,7 @@ RSpec.describe DiscourseAi::Embeddings::SemanticSearch do
end
def trigger_search(query)
DiscourseAi::Completions::Llm.with_prepared_responses(["<ai>#{hypothetical_post}</ai>"]) do
DiscourseAi::Completions::Llm.with_prepared_responses([hypothetical_post]) do
subject.search_for_topics(query)
end
end
@ -123,9 +123,9 @@ RSpec.describe DiscourseAi::Embeddings::SemanticSearch do
context "while searching as anon" do
it "returns an empty list" do
posts =
DiscourseAi::Completions::Llm.with_prepared_responses(
["<ai>#{hypothetical_post}</ai>"],
) { described_class.new(Guardian.new(nil)).search_for_topics(query) }
DiscourseAi::Completions::Llm.with_prepared_responses([hypothetical_post]) do
described_class.new(Guardian.new(nil)).search_for_topics(query)
end
expect(posts).to be_empty
end

View File

@ -125,7 +125,7 @@ RSpec.describe DiscourseAi::Personas::Tools::Search do
DiscourseAi::Embeddings::Schema.for(Topic).store(post1.topic, hyde_embedding, "digest")
results =
DiscourseAi::Completions::Llm.with_prepared_responses(["<ai>#{query}</ai>"]) do
DiscourseAi::Completions::Llm.with_prepared_responses([query]) do
search.invoke(&progress_blk)
end
@ -144,7 +144,7 @@ RSpec.describe DiscourseAi::Personas::Tools::Search do
# results will be expanded by semantic search, but it will find nothing
results =
DiscourseAi::Completions::Llm.with_prepared_responses(["<ai>#{query}</ai>"]) do
DiscourseAi::Completions::Llm.with_prepared_responses([query]) do
search.invoke(&progress_blk)
end

View File

@ -154,7 +154,7 @@ RSpec.describe DiscourseAi::Utils::Search do
# Using a completely different search query, should still find via semantic search
results =
DiscourseAi::Completions::Llm.with_prepared_responses(["<ai>#{query}</ai>"]) do
DiscourseAi::Completions::Llm.with_prepared_responses([query]) do
described_class.perform_search(
search_query: "totally different query",
current_user: admin,

View File

@ -19,7 +19,7 @@ RSpec.describe DiscourseAi::Admin::AiFeaturesController do
get "/admin/plugins/discourse-ai/ai-features.json"
expect(response.status).to eq(200)
expect(response.parsed_body["ai_features"].count).to eq(8)
expect(response.parsed_body["ai_features"].count).to eq(9)
end
end

View File

@ -28,7 +28,7 @@ RSpec.describe "Admin AI features configuration", type: :system, js: true do
ai_features_page.toggle_unconfigured
# this changes as we add more AI features
expect(ai_features_page).to have_listed_modules(7)
expect(ai_features_page).to have_listed_modules(8)
end
it "lists the persona used for the corresponding AI feature" do