FEATURE: Use personas for generating hypothetical posts (#1482)

* FEATURE: Use personas for generating hypothetical posts * Update prompt
2025-07-06 06:22:19 +00:00 · 2025-07-02 10:56:38 -03:00 · 2025-07-02 10:56:38 -03:00 · 75fb37144f
commit 75fb37144f
parent 40fa527633
15 changed files with 158 additions and 30 deletions
--- a/config/locales/client.en.yml
+++ b/config/locales/client.en.yml
@ -222,6 +222,10 @@ en:
          name: "Search"
          description: "Enhances search experience by providing AI-generated answers to queries"
          discoveries: "Discoveries"
+        embeddings:
+          name: "Embeddings"
+          description: "Powers features like Related Topics and AI Search by generating semantic representations of text"
+          hyde: "HyDE"
        discord:
          name: "Discord integration"
          description: "Adds the ability to search Discord channels"
--- a/config/locales/server.en.yml
+++ b/config/locales/server.en.yml
@ -394,6 +394,9 @@ en:
        spam_detector:
          name: "Spam detector"
          description: "Default persona powering our Spam detection feature"
+        content_creator:
+          name: "Content creator"
+          description: "Default persona powering HyDE search"

      topic_not_found: "Summary unavailable, topic not found!"
      summarizing: "Summarizing topic"
--- a/config/settings.yml
+++ b/config/settings.yml
@ -222,21 +222,30 @@ discourse_ai:
    default: false
    client: true
    validator: "DiscourseAi::Configuration::EmbeddingsModuleValidator"
+    area: "ai-features/embeddings"
  ai_embeddings_selected_model:
    type: enum
    default: ""
    allow_any: false
    enum: "DiscourseAi::Configuration::EmbeddingDefsEnumerator"
    validator: "DiscourseAi::Configuration::EmbeddingDefsValidator"
+    area: "ai-features/embeddings"
  ai_embeddings_per_post_enabled:
    default: false
    hidden: true
-  ai_embeddings_generate_for_pms: false
+  ai_embeddings_generate_for_pms: 
+    default: false
+    area: "ai-features/embeddings"
  ai_embeddings_semantic_related_topics_enabled:
    default: false
    client: true
-  ai_embeddings_semantic_related_topics: 5
-  ai_embeddings_semantic_related_include_closed_topics: true
+    area: "ai-features/embeddings"
+  ai_embeddings_semantic_related_topics: 
+    default: 5
+    area: "ai-features/embeddings"
+  ai_embeddings_semantic_related_include_closed_topics: 
+    default: true
+    area: "ai-features/embeddings"
  ai_embeddings_backfill_batch_size:
    default: 250
    hidden: true
@ -244,12 +253,14 @@ discourse_ai:
    default: false
    client: true
    validator: "DiscourseAi::Configuration::LlmDependencyValidator"
+    area: "ai-features/embeddings"
  ai_embeddings_semantic_search_hyde_model:
    default: ""
    type: enum
    allow_any: false
    enum: "DiscourseAi::Configuration::LlmEnumerator"
    validator: "DiscourseAi::Configuration::LlmValidator"
+    area: "ai-features/embeddings"
  ai_embeddings_semantic_search_hyde_model_allowed_seeded_models:
    default: ""
    hidden: true
@ -259,6 +270,12 @@ discourse_ai:
    default: false
    client: true
    hidden: true
+    area: "ai-features/embeddings"
+  ai_embeddings_semantic_search_hyde_persona:
+    default: "-32"
+    type: enum
+    enum: "DiscourseAi::Configuration::PersonaEnumerator"
+    area: "ai-features/embeddings"

  ai_embeddings_discourse_service_api_endpoint:
    default: ""
--- a/db/fixtures/personas/603_ai_personas.rb
+++ b/db/fixtures/personas/603_ai_personas.rb
@ -36,6 +36,8 @@ DiscourseAi::Personas::Persona.system_personas.each do |persona_class, id|
      setting_name = "ai_helper_custom_prompts_allowed_groups"
      default_groups = [Group::AUTO_GROUPS[:staff]]
      persona.allowed_group_ids = from_setting(setting_name) || default_groups
+    elsif persona_class == DiscourseAi::Personas::ContentCreator
+      persona.allowed_group_ids = [Group::AUTO_GROUPS[:everyone]]
    else
      persona.allowed_group_ids = [Group::AUTO_GROUPS[:trust_level_0]]
    end
--- a/lib/configuration/feature.rb
+++ b/lib/configuration/feature.rb
@ -144,6 +144,17 @@ module DiscourseAi
          ]
        end

+        def embeddings_features
+          feature_cache[:embeddings] ||= [
+            new(
+              "hyde",
+              "ai_embeddings_semantic_search_hyde_persona",
+              DiscourseAi::Configuration::Module::EMBEDDINGS_ID,
+              DiscourseAi::Configuration::Module::EMBEDDINGS,
+            ),
+          ]
+        end
+
        def lookup_bot_persona_ids
          AiPersona
            .where(enabled: true)
@ -196,6 +207,7 @@ module DiscourseAi
            translation_features,
            bot_features,
            spam_features,
+            embeddings_features,
          ].flatten
        end

@ -241,6 +253,8 @@ module DiscourseAi
              DiscourseAi::AiHelper::Assistant.find_ai_helper_model(name, persona_klass)
            when DiscourseAi::Configuration::Module::TRANSLATION
              DiscourseAi::Translation::BaseTranslator.preferred_llm_model(persona_klass)
+            when DiscourseAi::Configuration::Module::EMBEDDINGS
+              DiscourseAi::Embeddings::SemanticSearch.new(nil).find_ai_hyde_model(persona_klass)
            end

          if llm_model.blank? && persona.default_llm_id
--- a/lib/configuration/module.rb
+++ b/lib/configuration/module.rb
@ -11,8 +11,19 @@ module DiscourseAi
      TRANSLATION = "translation"
      BOT = "bot"
      SPAM = "spam"
+      EMBEDDINGS = "embeddings"

-      NAMES = [SUMMARIZATION, SEARCH, DISCORD, INFERENCE, AI_HELPER, TRANSLATION, BOT, SPAM].freeze
+      NAMES = [
+        SUMMARIZATION,
+        SEARCH,
+        DISCORD,
+        INFERENCE,
+        AI_HELPER,
+        TRANSLATION,
+        BOT,
+        SPAM,
+        EMBEDDINGS,
+      ].freeze

      SUMMARIZATION_ID = 1
      SEARCH_ID = 2
@ -22,6 +33,7 @@ module DiscourseAi
      TRANSLATION_ID = 6
      BOT_ID = 7
      SPAM_ID = 8
+      EMBEDDINGS_ID = 9

      class << self
        def all
@ -75,6 +87,13 @@ module DiscourseAi
              enabled_by_setting: "ai_spam_detection_enabled",
              features: DiscourseAi::Configuration::Feature.spam_features,
            ),
+            new(
+              EMBEDDINGS_ID,
+              EMBEDDINGS,
+              enabled_by_setting: "ai_embeddings_enabled",
+              features: DiscourseAi::Configuration::Feature.embeddings_features,
+              extra_check: -> { SiteSetting.ai_embeddings_semantic_search_enabled },
+            ),
          ]
        end

--- a/lib/embeddings/semantic_search.rb
+++ b/lib/embeddings/semantic_search.rb
@ -78,7 +78,9 @@ module DiscourseAi
          return Post.none
        end

-        search_embedding = hyde ? hyde_embedding(search_term) : embedding(search_term)
+        search_embedding = nil
+        search_embedding = hyde_embedding(search_term) if hyde
+        search_embedding = embedding(search_term) if search_embedding.blank?

        over_selection_limit = limit * OVER_SELECTION_FACTOR

@ -176,26 +178,47 @@ module DiscourseAi
      end

      def hypothetical_post_from(search_term)
-        prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
-          You are a content creator for a forum. The forum description is as follows:
-          #{SiteSetting.title}
-          #{SiteSetting.site_description}
+        context =
+          DiscourseAi::Personas::BotContext.new(
+            user: @guardian.user,
+            skip_tool_details: true,
+            feature_name: "semantic_search_hyde",
+            messages: [{ type: :user, content: search_term }],
+          )

-          Put the forum post between <ai></ai> tags.
-        TEXT
+        bot = build_bot(@guardian.user)
+        return nil if bot.nil?

-        prompt.push(type: :user, content: <<~TEXT.strip)
-          Using this description, write a forum post about the subject inside the <input></input> XML tags:
+        structured_output = nil
+        raw_response = +""
+        hyde_schema_key = bot.persona.response_format&.first.to_h

-          <input>#{search_term}</input>
-        TEXT
+        buffer_blk =
+          Proc.new do |partial, _, type|
+            if type == :structured_output
+              structured_output = partial
+            elsif type.blank?
+              # Assume response is a regular completion.
+              raw_response << partial
+            end
+          end

-        llm_response =
-          DiscourseAi::Completions::Llm.proxy(
-            SiteSetting.ai_embeddings_semantic_search_hyde_model,
-          ).generate(prompt, user: @guardian.user, feature_name: "semantic_search_hyde")
+        bot.reply(context, &buffer_blk)

-        Nokogiri::HTML5.fragment(llm_response).at("ai")&.text.presence || llm_response
+        structured_output&.read_buffered_property(hyde_schema_key["key"]&.to_sym) || raw_response
+      end
+
+      # Priorities are:
+      #   1. Persona's default LLM
+      #   2. `ai_embeddings_semantic_search_hyde_model` setting.
+      def find_ai_hyde_model(persona_klass)
+        model_id =
+          persona_klass.default_llm_id ||
+            SiteSetting.ai_embeddings_semantic_search_hyde_model&.split(":")&.last
+
+        return if model_id.blank?
+
+        LlmModel.find_by(id: model_id)
      end

      private
@ -209,6 +232,18 @@ module DiscourseAi
      def build_embedding_key(digest, hyde_model, embedding_model)
        "#{build_hyde_key(digest, hyde_model)}-#{embedding_model}"
      end
+
+      def build_bot(user)
+        persona_id = SiteSetting.ai_embeddings_semantic_search_hyde_persona
+
+        persona_klass = AiPersona.find_by(id: persona_id)&.class_instance
+        return if persona_klass.nil?
+
+        llm_model = find_ai_hyde_model(persona_klass)
+        return if llm_model.nil?
+
+        DiscourseAi::Personas::Bot.as(user, persona: persona_klass.new, model: llm_model)
+      end
    end
  end
 end
--- a/lib/personas/bot.rb
+++ b/lib/personas/bot.rb
@ -171,7 +171,7 @@ module DiscourseAi
              text = +""
              result.each { |item| text << item if item.is_a?(String) }
            end
-            raw_context << [text, bot_user.username]
+            raw_context << [text, bot_user&.username]
          end

          total_completions += 1
--- a/lib/personas/content_creator.rb
+++ b/lib/personas/content_creator.rb
@ -0,0 +1,33 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module Personas
+    class ContentCreator < Persona
+      def self.default_enabled
+        false
+      end
+
+      def system_prompt
+        <<~PROMPT.strip
+          You are a content creator for a forum. The forum title and description is as follows:
+          * Title: {site_title}
+          * Description: {site_description}
+
+          You will receive a couple of keywords and must create a post about the keywords, keeping the previous information in mind.
+
+          Format your response as a JSON object with a single key named "output", which has the created content.
+          Your output should be in the following format:
+            <output>
+              {"output": "xx"}
+            </output>
+
+          Where "xx" is replaced by the content.
+        PROMPT
+      end
+
+      def response_format
+        [{ "key" => "output", "type" => "string" }]
+      end
+    end
+  end
+end
--- a/lib/personas/persona.rb
+++ b/lib/personas/persona.rb
@ -69,6 +69,7 @@ module DiscourseAi
            TopicTitleTranslator => -29,
            ShortTextTranslator => -30,
            SpamDetector => -31,
+            ContentCreator => -32,
          }
        end

--- a/spec/lib/modules/embeddings/semantic_search_spec.rb
+++ b/spec/lib/modules/embeddings/semantic_search_spec.rb
@ -27,7 +27,7 @@ RSpec.describe DiscourseAi::Embeddings::SemanticSearch do
    end

    def trigger_search(query)
-      DiscourseAi::Completions::Llm.with_prepared_responses(["<ai>#{hypothetical_post}</ai>"]) do
+      DiscourseAi::Completions::Llm.with_prepared_responses([hypothetical_post]) do
        subject.search_for_topics(query)
      end
    end
@ -123,9 +123,9 @@ RSpec.describe DiscourseAi::Embeddings::SemanticSearch do
        context "while searching as anon" do
          it "returns an empty list" do
            posts =
-              DiscourseAi::Completions::Llm.with_prepared_responses(
-                ["<ai>#{hypothetical_post}</ai>"],
-              ) { described_class.new(Guardian.new(nil)).search_for_topics(query) }
+              DiscourseAi::Completions::Llm.with_prepared_responses([hypothetical_post]) do
+                described_class.new(Guardian.new(nil)).search_for_topics(query)
+              end

            expect(posts).to be_empty
          end
--- a/spec/lib/personas/tools/search_spec.rb
+++ b/spec/lib/personas/tools/search_spec.rb
@ -125,7 +125,7 @@ RSpec.describe DiscourseAi::Personas::Tools::Search do
        DiscourseAi::Embeddings::Schema.for(Topic).store(post1.topic, hyde_embedding, "digest")

        results =
-          DiscourseAi::Completions::Llm.with_prepared_responses(["<ai>#{query}</ai>"]) do
+          DiscourseAi::Completions::Llm.with_prepared_responses([query]) do
            search.invoke(&progress_blk)
          end

@ -144,7 +144,7 @@ RSpec.describe DiscourseAi::Personas::Tools::Search do

        # results will be expanded by semantic search, but it will find nothing
        results =
-          DiscourseAi::Completions::Llm.with_prepared_responses(["<ai>#{query}</ai>"]) do
+          DiscourseAi::Completions::Llm.with_prepared_responses([query]) do
            search.invoke(&progress_blk)
          end

--- a/spec/lib/utils/search_spec.rb
+++ b/spec/lib/utils/search_spec.rb
@ -154,7 +154,7 @@ RSpec.describe DiscourseAi::Utils::Search do

        # Using a completely different search query, should still find via semantic search
        results =
-          DiscourseAi::Completions::Llm.with_prepared_responses(["<ai>#{query}</ai>"]) do
+          DiscourseAi::Completions::Llm.with_prepared_responses([query]) do
            described_class.perform_search(
              search_query: "totally different query",
              current_user: admin,
--- a/spec/requests/admin/ai_features_controller_spec.rb
+++ b/spec/requests/admin/ai_features_controller_spec.rb
@ -19,7 +19,7 @@ RSpec.describe DiscourseAi::Admin::AiFeaturesController do
      get "/admin/plugins/discourse-ai/ai-features.json"

      expect(response.status).to eq(200)
-      expect(response.parsed_body["ai_features"].count).to eq(8)
+      expect(response.parsed_body["ai_features"].count).to eq(9)
    end
  end

--- a/spec/system/admin_ai_features_spec.rb
+++ b/spec/system/admin_ai_features_spec.rb
@ -28,7 +28,7 @@ RSpec.describe "Admin AI features configuration", type: :system, js: true do
    ai_features_page.toggle_unconfigured

    # this changes as we add more AI features
-    expect(ai_features_page).to have_listed_modules(7)
+    expect(ai_features_page).to have_listed_modules(8)
  end

  it "lists the persona used for the corresponding AI feature" do