FEATURE: Use personas for generating hypothetical posts (#1482)

* FEATURE: Use personas for generating hypothetical posts * Update prompt
2025-07-06 14:32:14 +00:00 · 2025-07-02 10:56:38 -03:00 · 2025-07-02 10:56:38 -03:00 · 75fb37144f
commit 75fb37144f
parent 40fa527633
15 changed files with 158 additions and 30 deletions
--- a/config/locales/client.en.yml
+++ b/config/locales/client.en.yml
@ -222,6 +222,10 @@ en:
          name: "Search"
          description: "Enhances search experience by providing AI-generated answers to queries"
          discoveries: "Discoveries"
        embeddings:
          name: "Embeddings"
          description: "Powers features like Related Topics and AI Search by generating semantic representations of text"
          hyde: "HyDE"
        discord:
          name: "Discord integration"
          description: "Adds the ability to search Discord channels"
--- a/config/locales/server.en.yml
+++ b/config/locales/server.en.yml
@ -394,6 +394,9 @@ en:
        spam_detector:
          name: "Spam detector"
          description: "Default persona powering our Spam detection feature"
        content_creator:
          name: "Content creator"
          description: "Default persona powering HyDE search"
      topic_not_found: "Summary unavailable, topic not found!"
      summarizing: "Summarizing topic"
--- a/config/settings.yml
+++ b/config/settings.yml
@ -222,21 +222,30 @@ discourse_ai:
    default: false
    client: true
    validator: "DiscourseAi::Configuration::EmbeddingsModuleValidator"
    area: "ai-features/embeddings"
  ai_embeddings_selected_model:
    type: enum
    default: ""
    allow_any: false
    enum: "DiscourseAi::Configuration::EmbeddingDefsEnumerator"
    validator: "DiscourseAi::Configuration::EmbeddingDefsValidator"
    area: "ai-features/embeddings"
  ai_embeddings_per_post_enabled:
    default: false
    hidden: true
-  ai_embeddings_generate_for_pms: false
+  ai_embeddings_generate_for_pms: 
    default: false
    area: "ai-features/embeddings"
  ai_embeddings_semantic_related_topics_enabled:
    default: false
    client: true
-  ai_embeddings_semantic_related_topics: 5
+    area: "ai-features/embeddings"
-  ai_embeddings_semantic_related_include_closed_topics: true
+  ai_embeddings_semantic_related_topics: 
    default: 5
    area: "ai-features/embeddings"
  ai_embeddings_semantic_related_include_closed_topics: 
    default: true
    area: "ai-features/embeddings"
  ai_embeddings_backfill_batch_size:
    default: 250
    hidden: true
@ -244,12 +253,14 @@ discourse_ai:
    default: false
    client: true
    validator: "DiscourseAi::Configuration::LlmDependencyValidator"
    area: "ai-features/embeddings"
  ai_embeddings_semantic_search_hyde_model:
    default: ""
    type: enum
    allow_any: false
    enum: "DiscourseAi::Configuration::LlmEnumerator"
    validator: "DiscourseAi::Configuration::LlmValidator"
    area: "ai-features/embeddings"
  ai_embeddings_semantic_search_hyde_model_allowed_seeded_models:
    default: ""
    hidden: true
@ -259,6 +270,12 @@ discourse_ai:
    default: false
    client: true
    hidden: true
    area: "ai-features/embeddings"
  ai_embeddings_semantic_search_hyde_persona:
    default: "-32"
    type: enum
    enum: "DiscourseAi::Configuration::PersonaEnumerator"
    area: "ai-features/embeddings"
  ai_embeddings_discourse_service_api_endpoint:
    default: ""
--- a/db/fixtures/personas/603_ai_personas.rb
+++ b/db/fixtures/personas/603_ai_personas.rb
@ -36,6 +36,8 @@ DiscourseAi::Personas::Persona.system_personas.each do |persona_class, id|
      setting_name = "ai_helper_custom_prompts_allowed_groups"
      default_groups = [Group::AUTO_GROUPS[:staff]]
      persona.allowed_group_ids = from_setting(setting_name) || default_groups
    elsif persona_class == DiscourseAi::Personas::ContentCreator
      persona.allowed_group_ids = [Group::AUTO_GROUPS[:everyone]]
    else
      persona.allowed_group_ids = [Group::AUTO_GROUPS[:trust_level_0]]
    end
--- a/lib/configuration/feature.rb
+++ b/lib/configuration/feature.rb
@ -144,6 +144,17 @@ module DiscourseAi
          ]
        end
        # Lists the features that belong to the embeddings module. The list is
        # built on first use and kept under the :embeddings key of feature_cache.
        # Currently it holds a single "hyde" feature tied to the
        # ai_embeddings_semantic_search_hyde_persona setting.
        def embeddings_features
          feature_cache[:embeddings] ||= begin
            embeddings_module = DiscourseAi::Configuration::Module
            hyde_feature =
              new(
                "hyde",
                "ai_embeddings_semantic_search_hyde_persona",
                embeddings_module::EMBEDDINGS_ID,
                embeddings_module::EMBEDDINGS,
              )
            [hyde_feature]
          end
        end
        def lookup_bot_persona_ids
          AiPersona
            .where(enabled: true)
@ -196,6 +207,7 @@ module DiscourseAi
            translation_features,
            bot_features,
            spam_features,
            embeddings_features,
          ].flatten
        end
@ -241,6 +253,8 @@ module DiscourseAi
              DiscourseAi::AiHelper::Assistant.find_ai_helper_model(name, persona_klass)
            when DiscourseAi::Configuration::Module::TRANSLATION
              DiscourseAi::Translation::BaseTranslator.preferred_llm_model(persona_klass)
            when DiscourseAi::Configuration::Module::EMBEDDINGS
              DiscourseAi::Embeddings::SemanticSearch.new(nil).find_ai_hyde_model(persona_klass)
            end
          if llm_model.blank? && persona.default_llm_id
--- a/lib/configuration/module.rb
+++ b/lib/configuration/module.rb
@ -11,8 +11,19 @@ module DiscourseAi
      TRANSLATION = "translation"
      BOT = "bot"
      SPAM = "spam"
      EMBEDDINGS = "embeddings"
-      NAMES = [SUMMARIZATION, SEARCH, DISCORD, INFERENCE, AI_HELPER, TRANSLATION, BOT, SPAM].freeze
+      NAMES = [
        SUMMARIZATION,
        SEARCH,
        DISCORD,
        INFERENCE,
        AI_HELPER,
        TRANSLATION,
        BOT,
        SPAM,
        EMBEDDINGS,
      ].freeze
      SUMMARIZATION_ID = 1
      SEARCH_ID = 2
@ -22,6 +33,7 @@ module DiscourseAi
      TRANSLATION_ID = 6
      BOT_ID = 7
      SPAM_ID = 8
      EMBEDDINGS_ID = 9
      class << self
        def all
@ -75,6 +87,13 @@ module DiscourseAi
              enabled_by_setting: "ai_spam_detection_enabled",
              features: DiscourseAi::Configuration::Feature.spam_features,
            ),
            new(
              EMBEDDINGS_ID,
              EMBEDDINGS,
              enabled_by_setting: "ai_embeddings_enabled",
              features: DiscourseAi::Configuration::Feature.embeddings_features,
              extra_check: -> { SiteSetting.ai_embeddings_semantic_search_enabled },
            ),
          ]
        end
--- a/lib/embeddings/semantic_search.rb
+++ b/lib/embeddings/semantic_search.rb
@ -78,7 +78,9 @@ module DiscourseAi
          return Post.none
        end
-        search_embedding = hyde ? hyde_embedding(search_term) : embedding(search_term)
+        search_embedding = nil
        search_embedding = hyde_embedding(search_term) if hyde
        search_embedding = embedding(search_term) if search_embedding.blank?
        over_selection_limit = limit * OVER_SELECTION_FACTOR
@ -176,26 +178,47 @@ module DiscourseAi
      end
      def hypothetical_post_from(search_term)
-        prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
+        context =
-          You are a content creator for a forum. The forum description is as follows:
+          DiscourseAi::Personas::BotContext.new(
-          #{SiteSetting.title}
+            user: @guardian.user,
-          #{SiteSetting.site_description}
+            skip_tool_details: true,
            feature_name: "semantic_search_hyde",
            messages: [{ type: :user, content: search_term }],
          )
-          Put the forum post between <ai></ai> tags.
+        bot = build_bot(@guardian.user)
-        TEXT
+        return nil if bot.nil?
-        prompt.push(type: :user, content: <<~TEXT.strip)
+        structured_output = nil
-          Using this description, write a forum post about the subject inside the <input></input> XML tags:
+        raw_response = +""
        hyde_schema_key = bot.persona.response_format&.first.to_h
-          <input>#{search_term}</input>
+        buffer_blk =
-        TEXT
+          Proc.new do |partial, _, type|
            if type == :structured_output
              structured_output = partial
            elsif type.blank?
              # Assume response is a regular completion.
              raw_response << partial
            end
          end
-        llm_response =
+        bot.reply(context, &buffer_blk)
          DiscourseAi::Completions::Llm.proxy(
            SiteSetting.ai_embeddings_semantic_search_hyde_model,
          ).generate(prompt, user: @guardian.user, feature_name: "semantic_search_hyde")
-        Nokogiri::HTML5.fragment(llm_response).at("ai")&.text.presence || llm_response
+        structured_output&.read_buffered_property(hyde_schema_key["key"]&.to_sym) || raw_response
      end
      # Picks the LLM record used for the given persona class.
      #
      # Lookup order:
      #   1. The persona's own default LLM id.
      #   2. The id taken from the `ai_embeddings_semantic_search_hyde_model`
      #      setting (the part after the last ":").
      #
      # Returns nil when neither source yields an id; otherwise returns
      # whatever LlmModel.find_by finds for that id.
      def find_ai_hyde_model(persona_klass)
        model_id = persona_klass.default_llm_id
        model_id ||= SiteSetting.ai_embeddings_semantic_search_hyde_model&.split(":")&.last

        LlmModel.find_by(id: model_id) unless model_id.blank?
      end
      private
@ -209,6 +232,18 @@ module DiscourseAi
      def build_embedding_key(digest, hyde_model, embedding_model)
        "#{build_hyde_key(digest, hyde_model)}-#{embedding_model}"
      end
      # Builds the bot that answers on behalf of `user`, using the persona
      # configured in `ai_embeddings_semantic_search_hyde_persona`.
      #
      # Returns nil when the persona cannot be resolved to a class or when
      # no LLM can be found for it.
      def build_bot(user)
        persona_record =
          AiPersona.find_by(id: SiteSetting.ai_embeddings_semantic_search_hyde_persona)
        persona_klass = persona_record&.class_instance
        return nil if persona_klass.nil?

        hyde_llm = find_ai_hyde_model(persona_klass)
        return nil if hyde_llm.nil?

        DiscourseAi::Personas::Bot.as(user, persona: persona_klass.new, model: hyde_llm)
      end
    end
  end
 end
--- a/lib/personas/bot.rb
+++ b/lib/personas/bot.rb
@ -171,7 +171,7 @@ module DiscourseAi
              text = +""
              result.each { |item| text << item if item.is_a?(String) }
            end
-            raw_context << [text, bot_user.username]
+            raw_context << [text, bot_user&.username]
          end
          total_completions += 1
--- a/lib/personas/content_creator.rb
+++ b/lib/personas/content_creator.rb
@ -0,0 +1,33 @@
 # frozen_string_literal: true
 module DiscourseAi
  module Personas
    # Persona that asks the model to write a forum post about a handful of
    # keywords and to return it as a JSON object with a single "output" key.
    class ContentCreator < Persona
      # This persona is not enabled by default.
      def self.default_enabled
        false
      end

      # Prompt text sent to the model.
      # NOTE(review): {site_title} and {site_description} look like template
      # placeholders filled in elsewhere; confirm against the Persona base class.
      def system_prompt
        <<~PROMPT.strip
          You are a content creator for a forum. The forum title and description is as follows:
          * Title: {site_title}
          * Description: {site_description}
          You will receive a couple of keywords and must create a post about the keywords, keeping the previous information in mind.
          Format your response as a JSON object with a single key named "output", which has the created content.
          Your output should be in the following format:
            <output>
              {"output": "xx"}
            </output>
          Where "xx" is replaced by the content.
        PROMPT
      end

      # Declares the structured response: one string property named "output",
      # matching the key the system prompt asks the model to produce.
      def response_format
        [{ "key" => "output", "type" => "string" }]
      end
    end
  end
 end
--- a/lib/personas/persona.rb
+++ b/lib/personas/persona.rb
@ -69,6 +69,7 @@ module DiscourseAi
            TopicTitleTranslator => -29,
            ShortTextTranslator => -30,
            SpamDetector => -31,
            ContentCreator => -32,
          }
        end
--- a/spec/lib/modules/embeddings/semantic_search_spec.rb
+++ b/spec/lib/modules/embeddings/semantic_search_spec.rb
@ -27,7 +27,7 @@ RSpec.describe DiscourseAi::Embeddings::SemanticSearch do
    end
    def trigger_search(query)
-      DiscourseAi::Completions::Llm.with_prepared_responses(["<ai>#{hypothetical_post}</ai>"]) do
+      DiscourseAi::Completions::Llm.with_prepared_responses([hypothetical_post]) do
        subject.search_for_topics(query)
      end
    end
@ -123,9 +123,9 @@ RSpec.describe DiscourseAi::Embeddings::SemanticSearch do
        context "while searching as anon" do
          it "returns an empty list" do
            posts =
-              DiscourseAi::Completions::Llm.with_prepared_responses(
+              DiscourseAi::Completions::Llm.with_prepared_responses([hypothetical_post]) do
-                ["<ai>#{hypothetical_post}</ai>"],
+                described_class.new(Guardian.new(nil)).search_for_topics(query)
-              ) { described_class.new(Guardian.new(nil)).search_for_topics(query) }
+              end
            expect(posts).to be_empty
          end
--- a/spec/lib/personas/tools/search_spec.rb
+++ b/spec/lib/personas/tools/search_spec.rb
@ -125,7 +125,7 @@ RSpec.describe DiscourseAi::Personas::Tools::Search do
        DiscourseAi::Embeddings::Schema.for(Topic).store(post1.topic, hyde_embedding, "digest")
        results =
-          DiscourseAi::Completions::Llm.with_prepared_responses(["<ai>#{query}</ai>"]) do
+          DiscourseAi::Completions::Llm.with_prepared_responses([query]) do
            search.invoke(&progress_blk)
          end
@ -144,7 +144,7 @@ RSpec.describe DiscourseAi::Personas::Tools::Search do
        # results will be expanded by semantic search, but it will find nothing
        results =
-          DiscourseAi::Completions::Llm.with_prepared_responses(["<ai>#{query}</ai>"]) do
+          DiscourseAi::Completions::Llm.with_prepared_responses([query]) do
            search.invoke(&progress_blk)
          end
--- a/spec/lib/utils/search_spec.rb
+++ b/spec/lib/utils/search_spec.rb
@ -154,7 +154,7 @@ RSpec.describe DiscourseAi::Utils::Search do
        # Using a completely different search query, should still find via semantic search
        results =
-          DiscourseAi::Completions::Llm.with_prepared_responses(["<ai>#{query}</ai>"]) do
+          DiscourseAi::Completions::Llm.with_prepared_responses([query]) do
            described_class.perform_search(
              search_query: "totally different query",
              current_user: admin,
--- a/spec/requests/admin/ai_features_controller_spec.rb
+++ b/spec/requests/admin/ai_features_controller_spec.rb
@ -19,7 +19,7 @@ RSpec.describe DiscourseAi::Admin::AiFeaturesController do
      get "/admin/plugins/discourse-ai/ai-features.json"
      expect(response.status).to eq(200)
-      expect(response.parsed_body["ai_features"].count).to eq(8)
+      expect(response.parsed_body["ai_features"].count).to eq(9)
    end
  end
--- a/spec/system/admin_ai_features_spec.rb
+++ b/spec/system/admin_ai_features_spec.rb
@ -28,7 +28,7 @@ RSpec.describe "Admin AI features configuration", type: :system, js: true do
    ai_features_page.toggle_unconfigured
    # this changes as we add more AI features
-    expect(ai_features_page).to have_listed_modules(7)
+    expect(ai_features_page).to have_listed_modules(8)
  end
  it "lists the persona used for the corresponding AI feature" do