discourse-ai/lib/personas/tools/researcher.rb
Sam 6817866de9
FEATURE: allow access to assigns from forum researcher (#1412)
* FEATURE: allow access to assigns from forum researcher

* FIX: should properly be checking for empty

* finish PR
2025-06-06 16:59:00 +10:00

# frozen_string_literal: true
module DiscourseAi
module Personas
module Tools
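# Researcher tool: takes a filter string plus one or more research goals, counts
# the matching forum posts (optionally as a dry run), then feeds the filtered
# content to an LLM in token-budgeted batches and collects its findings.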
class Researcher < Tool
attr_reader :filter, :result_count, :goals, :dry_run
class << self
def signature
{
name: name,
description:
"Analyze and extract information from content across the forum based on specified filters",
parameters: [
{ name: "filter", description: filter_description, type: "string" },
{
name: "goals",
description:
"The specific information you want to extract or analyze from the filtered content, you may specify multiple goals",
type: "string",
},
{
name: "dry_run",
description: "When true, only count matching posts without processing data",
type: "boolean",
},
],
}
end
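# Illustrative tool call against the signature above (values are hypothetical;
# "dry_run" may be omitted):
#
#   {
#     "filter" => "category:bugs after:2025-01-01 max_results:100",
#     "goals" => "summarize the most frequently reported crashes",
#     "dry_run" => true,
#   }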
def filter_description
<<~TEXT
Filter string to target specific content. Space-separated filters combine with AND logic; the OR keyword starts a separate filter group.
**Filters:**
- username:user1 or usernames:user1,user2 - posts by specific users
- group:group1 or groups:group1,group2 - posts by users in specific groups
- post_type:first|reply - first posts only or replies only
- keywords:word1,word2 - full-text search in post content
- topic_keywords:word1,word2 - full-text search in topics (returns all posts from matching topics)
- topic:123 or topics:123,456 - specific topics by ID
- category:name1 or categories:name1,name2 - posts in categories (by name/slug)
- tag:tag1 or tags:tag1,tag2 - posts in topics with tags
- after:YYYY-MM-DD, before:YYYY-MM-DD - filter by post creation date
- topic_after:YYYY-MM-DD, topic_before:YYYY-MM-DD - filter by topic creation date
- status:open|closed|archived|noreplies|single_user - topic status filters
- max_results:N - limit results (per OR group)
- order:latest|oldest|latest_topic|oldest_topic|likes - sort order
#{assign_tip}
**OR Logic:** Each OR group processes independently - filters don't cross boundaries.
Examples:
- 'username:sam after:2023-01-01' - sam's posts after date
- 'max_results:50 category:bugs OR tag:urgent' - (50 bug posts) OR (all urgent posts)
TEXT
end
def assign_tip
if SiteSetting.respond_to?(:assign_enabled) && SiteSetting.assign_enabled
(<<~TEXT).strip
assigned_to:username or assigned_to:username1,username2 - topics assigned to a specific user
assigned_to:* - topics assigned to any user
assigned_to:nobody - topics not assigned to any user
TEXT
end
end
def name
"researcher"
end
def accepted_options
[
option(:researcher_llm, type: :llm),
option(:max_results, type: :integer),
option(:include_private, type: :boolean),
option(:max_tokens_per_post, type: :integer),
option(:max_tokens_per_batch, type: :integer),
]
end
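# A hypothetical persona configuration for the options above (ids and numbers are
# examples only):
#
#   researcher_llm: 42            # LlmModel id used for the research passes
#   max_results: 500              # cap on matching posts (invoke defaults to 1000)
#   include_private: true         # search with the invoking user's Guardian
#   max_tokens_per_post: 1000     # per-post truncation budget (defaults to 2000)
#   max_tokens_per_batch: 20_000  # prompt budget per LLM call (defaults to the
#                                 # model's max_prompt_tokens minus 2000)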
end
def invoke(&blk)
max_results = options[:max_results] || 1000
@filter = parameters[:filter] || ""
@goals = parameters[:goals] || ""
@dry_run = parameters[:dry_run].nil? ? false : parameters[:dry_run]
post = Post.find_by(id: context.post_id)
return { error: "No goals provided" } if @goals.blank?
return { error: "No filter provided" } if @filter.blank?
guardian = nil
guardian = Guardian.new(context.user) if options[:include_private]
filter =
DiscourseAi::Utils::Research::Filter.new(
@filter,
limit: max_results,
guardian: guardian,
)
if filter.invalid_filters.present?
return(
{
error:
"Invalid filter fragment: #{filter.invalid_filters.join(" ")}\n\n#{self.class.filter_description}",
}
)
end
@result_count = filter.search.count
blk.call details
if @dry_run
{ dry_run: true, goals: @goals, filter: @filter, number_of_posts: @result_count }
else
process_filter(filter, @goals, post, &blk)
end
rescue StandardError => e
{ error: "Error processing research: #{e.message}" }
end
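# Shapes returned by #invoke (values illustrative):
#   dry run:   { dry_run: true, goals: "...", filter: "...", number_of_posts: 123 }
#   full run:  { dry_run: false, goals: "...", filter: "...", results: [...] }
#   bad input: { error: "No goals provided" } or { error: "Invalid filter fragment: ..." }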
def details
if @dry_run
I18n.t("discourse_ai.ai_bot.tool_description.researcher_dry_run", description_args)
else
I18n.t("discourse_ai.ai_bot.tool_description.researcher", description_args)
end
end
def summary
if @dry_run
I18n.t("discourse_ai.ai_bot.tool_summary.researcher_dry_run")
else
I18n.t("discourse_ai.ai_bot.tool_summary.researcher")
end
end
def description_args
{ count: @result_count || 0, filter: @filter || "", goals: @goals || "" }
end
protected
MIN_TOKENS_FOR_RESEARCH = 8000
MIN_TOKENS_FOR_POST = 50
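# Worked example of the batching budget below, assuming a model with
# max_prompt_tokens of 32_000 and no options set: max_tokens_per_batch falls back
# to 32_000 - 2_000 = 30_000 tokens per LLM call, each post is truncated to at
# most 2_000 tokens, and an explicit max_tokens_per_post below 50 is raised to
# MIN_TOKENS_FOR_POST.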
def process_filter(filter, goals, post, &blk)
if researcher_llm.max_prompt_tokens < MIN_TOKENS_FOR_RESEARCH
raise ArgumentError,
"LLM max tokens too low for research. Minimum is #{MIN_TOKENS_FOR_RESEARCH}."
end
max_tokens_per_batch = options[:max_tokens_per_batch].to_i
if max_tokens_per_batch <= MIN_TOKENS_FOR_RESEARCH
max_tokens_per_batch = researcher_llm.max_prompt_tokens - 2000
end
max_tokens_per_post = options[:max_tokens_per_post]
if max_tokens_per_post.nil?
max_tokens_per_post = 2000
elsif max_tokens_per_post < MIN_TOKENS_FOR_POST
max_tokens_per_post = MIN_TOKENS_FOR_POST
end
formatter =
DiscourseAi::Utils::Research::LlmFormatter.new(
filter,
max_tokens_per_batch: max_tokens_per_batch,
tokenizer: researcher_llm.tokenizer,
max_tokens_per_post: max_tokens_per_post,
)
results = []
formatter.each_chunk { |chunk| results << run_inference(chunk[:text], goals, post, &blk) }
if context.cancel_manager&.cancelled?
{
dry_run: false,
goals: goals,
filter: @filter,
results: "Cancelled by user",
cancelled_by_user: true,
}
else
{ dry_run: false, goals: goals, filter: @filter, results: results }
end
end
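# Prefers the optional :researcher_llm model (looked up by id) and falls back to
# this tool's default LLM (self.llm) when the option is unset or does not resolve.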
def researcher_llm
@researcher_llm ||=
(
options[:researcher_llm].present? &&
LlmModel.find_by(id: options[:researcher_llm].to_i)&.to_llm
) || self.llm
end
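# Sends one formatted chunk of posts to the research LLM along with the goals,
# accumulates the streamed response, and reports progress by appending one dot
# per completed batch to the tool details.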
def run_inference(chunk_text, goals, post, &blk)
return if context.cancel_manager&.cancelled?
system_prompt = goal_system_prompt(goals)
user_prompt = goal_user_prompt(goals, chunk_text)
prompt =
DiscourseAi::Completions::Prompt.new(
system_prompt,
messages: [{ type: :user, content: user_prompt }],
post_id: post.id,
topic_id: post.topic_id,
)
results = []
researcher_llm.generate(
prompt,
user: post.user,
feature_name: context.feature_name,
cancel_manager: context.cancel_manager,
) { |partial| results << partial }
@progress_dots ||= 0
@progress_dots += 1
blk.call(details + "\n\n#{"." * @progress_dots}")
results.join
end
def goal_system_prompt(goals)
<<~TEXT
You are a researcher tool designed to analyze and extract information from forum content on #{Discourse.base_url}.
The current date is #{::Time.zone.now.strftime("%a, %d %b %Y %H:%M %Z")}.
Your task is to process the provided content and extract relevant information based on the specified goal.
When extracting content ALWAYS include the following:
- Multiple citations using Markdown
- Topic citations: Interesting fact [ref](/t/-/TOPIC_ID)
- Post citations: Interesting fact [ref](/t/-/TOPIC_ID/POST_NUMBER)
- Relevant quotes from the direct source content
- Relevant dates and times from the content
Your goal is: #{goals}
TEXT
end
def goal_user_prompt(goals, chunk_text)
<<~TEXT
Here is the content to analyze:
{{{
#{chunk_text}
}}}
Your goal is: #{goals}
TEXT
end
end
end
end
end