# Mirror of https://github.com/discourse/discourse-ai.git
# Synced 2025-07-09 15:43:28 +00:00 (182 lines, 6.1 KiB, Ruby)
# frozen_string_literal: true

module DiscourseAi
  module Personas
    module Tools
      # Research tool: selects forum content via a mini filter language,
      # then runs LLM inference over the matching content in batches to
      # extract information according to caller-supplied goals. A dry-run
      # mode only reports how many items the filter matches.
      class Researcher < Tool
        attr_reader :filter, :result_count, :goals, :dry_run

        class << self
          # Tool signature exposed to the LLM: name, description and the
          # accepted parameters (filter, goals, dry_run).
          def signature
            {
              name: name,
              description:
                "Analyze and extract information from content across the forum based on specified filters",
              parameters: [
                { name: "filter", description: filter_description, type: "string" },
                {
                  name: "goals",
                  description:
                    "The specific information you want to extract or analyze from the filtered content, you may specify multiple goals",
                  type: "string",
                },
                {
                  name: "dry_run",
                  description: "When true, only count matching items without processing data",
                  type: "boolean",
                },
              ],
            }
          end

          # Documentation of the filter mini-language shown to the LLM as
          # the "filter" parameter description.
          def filter_description
            <<~TEXT
              Filter string to target specific content.
              - Supports user (@username)
              - date ranges (after:YYYY-MM-DD, before:YYYY-MM-DD for posts; topic_after:YYYY-MM-DD, topic_before:YYYY-MM-DD for topics)
              - categories (category:category1,category2)
              - tags (tag:tag1,tag2)
              - groups (group:group1,group2).
              - status (status:open, status:closed, status:archived, status:noreplies, status:single_user)
              - keywords (keywords:keyword1,keyword2) - specific words to search for in posts
              - max_results (max_results:10) the maximum number of results to return (optional)
              - order (order:latest, order:oldest, order:latest_topic, order:oldest_topic) - the order of the results (optional)

              If multiple tags or categories are specified, they are treated as OR conditions.

              Multiple filters can be combined with spaces. Example: '@sam after:2023-01-01 tag:feature'
            TEXT
          end

          def name
            "researcher"
          end

          # Site-configurable options for this tool (presumably surfaced in
          # the persona admin UI — see the Tool base class).
          def accepted_options
            [
              option(:max_results, type: :integer),
              option(:include_private, type: :boolean),
              option(:max_tokens_per_post, type: :integer),
            ]
          end
        end

        # Runs the tool. Validates parameters, counts matching content, and
        # either returns the count (dry run) or processes the matches with
        # the LLM. +blk+ receives progress/status strings while running.
        #
        # Returns a Hash: either an { error: } hash on bad input, a dry-run
        # summary, or the full results from #process_filter.
        def invoke(&blk)
          max_results = options[:max_results] || 1000

          @filter = parameters[:filter] || ""
          @goals = parameters[:goals] || ""
          # nil (parameter omitted) means "not a dry run"; an explicit
          # false is preserved unchanged by `|| false`.
          @dry_run = parameters[:dry_run] || false

          post = Post.find_by(id: context.post_id)

          return { error: "No goals provided" } if @goals.blank?
          return { error: "No filter provided" } if @filter.blank?

          # Only elevate visibility when include_private is enabled; a nil
          # guardian restricts the search to public content. The modifier-if
          # assignment leaves `guardian` defined as nil when skipped.
          guardian = Guardian.new(context.user) if options[:include_private]

          filter =
            DiscourseAi::Utils::Research::Filter.new(
              @filter,
              limit: max_results,
              guardian: guardian,
            )
          @result_count = filter.search.count

          # Stream an early status line before starting any heavy work.
          blk.call details

          if @dry_run
            { dry_run: true, goals: @goals, filter: @filter, number_of_results: @result_count }
          else
            process_filter(filter, @goals, post, &blk)
          end
        end

        # Localized status line describing the current run (dry or real).
        def details
          if @dry_run
            I18n.t("discourse_ai.ai_bot.tool_description.researcher_dry_run", description_args)
          else
            I18n.t("discourse_ai.ai_bot.tool_description.researcher", description_args)
          end
        end

        # Interpolation arguments for the #details translations.
        def description_args
          { count: @result_count || 0, filter: @filter || "", goals: @goals || "" }
        end

        protected

        # Below this budget a single batch leaves too little room for
        # content plus instructions to produce useful research output.
        MIN_TOKENS_FOR_RESEARCH = 8000

        # Formats the filtered content into token-bounded batches and runs
        # inference on each, collecting the per-batch outputs.
        #
        # Raises ArgumentError when the LLM's context window is too small.
        def process_filter(filter, goals, post, &blk)
          if llm.max_prompt_tokens < MIN_TOKENS_FOR_RESEARCH
            raise ArgumentError,
                  "LLM max tokens too low for research. Minimum is #{MIN_TOKENS_FOR_RESEARCH}."
          end
          formatter =
            DiscourseAi::Utils::Research::LlmFormatter.new(
              filter,
              # Reserve headroom for the system/user prompt wrapping.
              max_tokens_per_batch: llm.max_prompt_tokens - 2000,
              tokenizer: llm.tokenizer,
              max_tokens_per_post: options[:max_tokens_per_post] || 2000,
            )

          results = []

          formatter.each_chunk { |chunk| results << run_inference(chunk[:text], goals, post, &blk) }
          { dry_run: false, goals: goals, filter: @filter, results: results }
        end

        # Runs one LLM completion over a single content chunk, streams a
        # dot-based progress update through +blk+, and returns the joined
        # completion text.
        def run_inference(chunk_text, goals, post, &blk)
          system_prompt = goal_system_prompt(goals)
          user_prompt = goal_user_prompt(goals, chunk_text)

          prompt =
            DiscourseAi::Completions::Prompt.new(
              system_prompt,
              messages: [{ type: :user, content: user_prompt }],
              post_id: post.id,
              topic_id: post.topic_id,
            )

          results = []
          llm.generate(
            prompt,
            user: post.user,
            feature_name: context.feature_name,
            cancel_manager: context.cancel_manager,
          ) { |partial| results << partial }

          # One extra dot per processed chunk so the user sees progress.
          @progress_dots ||= 0
          @progress_dots += 1
          blk.call(details + "\n\n#{"." * @progress_dots}")
          results.join
        end

        # System prompt instructing the model to act as a research tool.
        def goal_system_prompt(goals)
          <<~TEXT
            You are a researcher tool designed to analyze and extract information from forum content.
            Your task is to process the provided content and extract relevant information based on the specified goal.

            Your goal is: #{goals}
          TEXT
        end

        # User prompt wrapping the chunk in {{{ }}} delimiters alongside
        # the research goal.
        def goal_user_prompt(goals, chunk_text)
          <<~TEXT
            Here is the content to analyze:

            {{{
            #{chunk_text}
            }}}

            Your goal is: #{goals}
          TEXT
        end
      end
    end
  end
end
|