discourse-ai/lib/personas/tools/researcher.rb
Sam 4dffd0b2c5
DEV: improve tool infra, improve forum researcher prompts, improve logging (#1391)
- add sleep function for tool polling with rate limits
- Support base64 encoding for HTTP requests and uploads
- Enhance forum researcher with cost warnings and comprehensive planning
- Add cancellation support for research operations
- Include feature_name parameter for bot analytics
- richer research support (OR queries)
2025-06-03 15:17:55 +10:00

226 lines
8.1 KiB
Ruby

# frozen_string_literal: true
module DiscourseAi
module Personas
module Tools
class Researcher < Tool
attr_reader :filter, :result_count, :goals, :dry_run
class << self
def signature
{
name: name,
description:
"Analyze and extract information from content across the forum based on specified filters",
parameters: [
{ name: "filter", description: filter_description, type: "string" },
{
name: "goals",
description:
"The specific information you want to extract or analyze from the filtered content, you may specify multiple goals",
type: "string",
},
{
name: "dry_run",
description: "When true, only count matching posts without processing data",
type: "boolean",
},
],
}
end
def filter_description
<<~TEXT
Filter string to target specific content.
- Supports user (@username)
- post_type:first - only includes first posts in topics
- post_type:reply - only replies in topics
- date ranges (after:YYYY-MM-DD, before:YYYY-MM-DD for posts; topic_after:YYYY-MM-DD, topic_before:YYYY-MM-DD for topics)
- categories (category:category1,category2 or categories:category1,category2)
- tags (tag:tag1,tag2 or tags:tag1,tag2)
- groups (group:group1,group2 or groups:group1,group2)
- status (status:open, status:closed, status:archived, status:noreplies, status:single_user)
- keywords (keywords:keyword1,keyword2) - searches for specific words within post content using full-text search
- topic_keywords (topic_keywords:keyword1,keyword2) - searches for keywords within topics, returns all posts from matching topics
- topics (topic:topic_id1,topic_id2 or topics:topic_id1,topic_id2) - target specific topics by ID
- max_results (max_results:10) - limits the maximum number of results returned (optional)
- order (order:latest, order:oldest, order:latest_topic, order:oldest_topic, order:likes) - controls result ordering (optional, defaults to latest posts)
Multiple filters can be combined with spaces for AND logic. Example: '@sam after:2023-01-01 tag:feature'
Use OR to combine filter segments for inclusive logic.
Example: 'category:feature,bug OR tag:feature-tag' - includes posts in feature OR bug categories, OR posts with feature-tag tag
Example: '@sam category:bug' - includes posts by @sam AND in bug category
TEXT
end
def name
"researcher"
end
def accepted_options
[
option(:max_results, type: :integer),
option(:include_private, type: :boolean),
option(:max_tokens_per_post, type: :integer),
]
end
end
def invoke(&blk)
max_results = options[:max_results] || 1000
@filter = parameters[:filter] || ""
@goals = parameters[:goals] || ""
@dry_run = parameters[:dry_run].nil? ? false : parameters[:dry_run]
post = Post.find_by(id: context.post_id)
goals = parameters[:goals] || ""
dry_run = parameters[:dry_run].nil? ? false : parameters[:dry_run]
return { error: "No goals provided" } if goals.blank?
return { error: "No filter provided" } if @filter.blank?
guardian = nil
guardian = Guardian.new(context.user) if options[:include_private]
filter =
DiscourseAi::Utils::Research::Filter.new(
@filter,
limit: max_results,
guardian: guardian,
)
if filter.invalid_filters.present?
return(
{
error:
"Invalid filter fragment: #{filter.invalid_filters.join(" ")}\n\n#{self.class.filter_description}",
}
)
end
@result_count = filter.search.count
blk.call details
if dry_run
{ dry_run: true, goals: goals, filter: @filter, number_of_posts: @result_count }
else
process_filter(filter, goals, post, &blk)
end
end
def details
if @dry_run
I18n.t("discourse_ai.ai_bot.tool_description.researcher_dry_run", description_args)
else
I18n.t("discourse_ai.ai_bot.tool_description.researcher", description_args)
end
end
def summary
if @dry_run
I18n.t("discourse_ai.ai_bot.tool_summary.researcher_dry_run")
else
I18n.t("discourse_ai.ai_bot.tool_summary.researcher")
end
end
def description_args
{ count: @result_count || 0, filter: @filter || "", goals: @goals || "" }
end
protected
MIN_TOKENS_FOR_RESEARCH = 8000
def process_filter(filter, goals, post, &blk)
if llm.max_prompt_tokens < MIN_TOKENS_FOR_RESEARCH
raise ArgumentError,
"LLM max tokens too low for research. Minimum is #{MIN_TOKENS_FOR_RESEARCH}."
end
formatter =
DiscourseAi::Utils::Research::LlmFormatter.new(
filter,
max_tokens_per_batch: llm.max_prompt_tokens - 2000,
tokenizer: llm.tokenizer,
max_tokens_per_post: options[:max_tokens_per_post] || 2000,
)
results = []
formatter.each_chunk { |chunk| results << run_inference(chunk[:text], goals, post, &blk) }
if context.cancel_manager&.cancelled?
{
dry_run: false,
goals: goals,
filter: @filter,
results: "Cancelled by user",
cancelled_by_user: true,
}
else
{ dry_run: false, goals: goals, filter: @filter, results: results }
end
end
def run_inference(chunk_text, goals, post, &blk)
return if context.cancel_manager&.cancelled?
system_prompt = goal_system_prompt(goals)
user_prompt = goal_user_prompt(goals, chunk_text)
prompt =
DiscourseAi::Completions::Prompt.new(
system_prompt,
messages: [{ type: :user, content: user_prompt }],
post_id: post.id,
topic_id: post.topic_id,
)
results = []
llm.generate(
prompt,
user: post.user,
feature_name: context.feature_name,
cancel_manager: context.cancel_manager,
) { |partial| results << partial }
@progress_dots ||= 0
@progress_dots += 1
blk.call(details + "\n\n#{"." * @progress_dots}")
results.join
end
def goal_system_prompt(goals)
<<~TEXT
You are a researcher tool designed to analyze and extract information from forum content on #{Discourse.base_url}.
The current date is #{::Time.zone.now.strftime("%a, %d %b %Y %H:%M %Z")}.
Your task is to process the provided content and extract relevant information based on the specified goal.
When extracting content ALWAYS include the following:
- Multiple citations using Markdown
- Topic citations: Interesting fact [ref](/t/-/TOPIC_ID)
- Post citations: Interesting fact [ref](/t/-/TOPIC_ID/POST_NUMBER)
- Relevent quotes from the direct source content
- Relevant dates and times from the content
Your goal is: #{goals}
TEXT
end
def goal_user_prompt(goals, chunk_text)
<<~TEXT
Here is the content to analyze:
{{{
#{chunk_text}
}}}
Your goal is: #{goals}
TEXT
end
end
end
end
end