discourse-ai/lib/personas/tools/researcher.rb
Sam 4dffd0b2c5
DEV: improve tool infra, improve forum researcher prompts, improve logging (#1391)
- add sleep function for tool polling with rate limits
- Support base64 encoding for HTTP requests and uploads
- Enhance forum researcher with cost warnings and comprehensive planning
- Add cancellation support for research operations
- Include feature_name parameter for bot analytics
- richer research support (OR queries)
2025-06-03 15:17:55 +10:00

226 lines
8.1 KiB
Ruby

# frozen_string_literal: true
module DiscourseAi
module Personas
module Tools
class Researcher < Tool
attr_reader :filter, :result_count, :goals, :dry_run
class << self
def signature
{
name: name,
description:
"Analyze and extract information from content across the forum based on specified filters",
parameters: [
{ name: "filter", description: filter_description, type: "string" },
{
name: "goals",
description:
"The specific information you want to extract or analyze from the filtered content, you may specify multiple goals",
type: "string",
},
{
name: "dry_run",
description: "When true, only count matching posts without processing data",
type: "boolean",
},
],
}
end
def filter_description
<<~TEXT
Filter string to target specific content.
- Supports user (@username)
- post_type:first - only includes first posts in topics
- post_type:reply - only replies in topics
- date ranges (after:YYYY-MM-DD, before:YYYY-MM-DD for posts; topic_after:YYYY-MM-DD, topic_before:YYYY-MM-DD for topics)
- categories (category:category1,category2 or categories:category1,category2)
- tags (tag:tag1,tag2 or tags:tag1,tag2)
- groups (group:group1,group2 or groups:group1,group2)
- status (status:open, status:closed, status:archived, status:noreplies, status:single_user)
- keywords (keywords:keyword1,keyword2) - searches for specific words within post content using full-text search
- topic_keywords (topic_keywords:keyword1,keyword2) - searches for keywords within topics, returns all posts from matching topics
- topics (topic:topic_id1,topic_id2 or topics:topic_id1,topic_id2) - target specific topics by ID
- max_results (max_results:10) - limits the maximum number of results returned (optional)
- order (order:latest, order:oldest, order:latest_topic, order:oldest_topic, order:likes) - controls result ordering (optional, defaults to latest posts)
Multiple filters can be combined with spaces for AND logic. Example: '@sam after:2023-01-01 tag:feature'
Use OR to combine filter segments for inclusive logic.
Example: 'category:feature,bug OR tag:feature-tag' - includes posts in feature OR bug categories, OR posts with feature-tag tag
Example: '@sam category:bug' - includes posts by @sam AND in bug category
TEXT
end
def name
"researcher"
end
def accepted_options
[
option(:max_results, type: :integer),
option(:include_private, type: :boolean),
option(:max_tokens_per_post, type: :integer),
]
end
end
def invoke(&blk)
max_results = options[:max_results] || 1000
@filter = parameters[:filter] || ""
@goals = parameters[:goals] || ""
@dry_run = parameters[:dry_run].nil? ? false : parameters[:dry_run]
post = Post.find_by(id: context.post_id)
goals = parameters[:goals] || ""
dry_run = parameters[:dry_run].nil? ? false : parameters[:dry_run]
return { error: "No goals provided" } if goals.blank?
return { error: "No filter provided" } if @filter.blank?
guardian = nil
guardian = Guardian.new(context.user) if options[:include_private]
filter =
DiscourseAi::Utils::Research::Filter.new(
@filter,
limit: max_results,
guardian: guardian,
)
if filter.invalid_filters.present?
return(
{
error:
"Invalid filter fragment: #{filter.invalid_filters.join(" ")}\n\n#{self.class.filter_description}",
}
)
end
@result_count = filter.search.count
blk.call details
if dry_run
{ dry_run: true, goals: goals, filter: @filter, number_of_posts: @result_count }
else
process_filter(filter, goals, post, &blk)
end
end
def details
if @dry_run
I18n.t("discourse_ai.ai_bot.tool_description.researcher_dry_run", description_args)
else
I18n.t("discourse_ai.ai_bot.tool_description.researcher", description_args)
end
end
def summary
if @dry_run
I18n.t("discourse_ai.ai_bot.tool_summary.researcher_dry_run")
else
I18n.t("discourse_ai.ai_bot.tool_summary.researcher")
end
end
def description_args
{ count: @result_count || 0, filter: @filter || "", goals: @goals || "" }
end
protected
MIN_TOKENS_FOR_RESEARCH = 8000
def process_filter(filter, goals, post, &blk)
if llm.max_prompt_tokens < MIN_TOKENS_FOR_RESEARCH
raise ArgumentError,
"LLM max tokens too low for research. Minimum is #{MIN_TOKENS_FOR_RESEARCH}."
end
formatter =
DiscourseAi::Utils::Research::LlmFormatter.new(
filter,
max_tokens_per_batch: llm.max_prompt_tokens - 2000,
tokenizer: llm.tokenizer,
max_tokens_per_post: options[:max_tokens_per_post] || 2000,
)
results = []
formatter.each_chunk { |chunk| results << run_inference(chunk[:text], goals, post, &blk) }
if context.cancel_manager&.cancelled?
{
dry_run: false,
goals: goals,
filter: @filter,
results: "Cancelled by user",
cancelled_by_user: true,
}
else
{ dry_run: false, goals: goals, filter: @filter, results: results }
end
end
def run_inference(chunk_text, goals, post, &blk)
return if context.cancel_manager&.cancelled?
system_prompt = goal_system_prompt(goals)
user_prompt = goal_user_prompt(goals, chunk_text)
prompt =
DiscourseAi::Completions::Prompt.new(
system_prompt,
messages: [{ type: :user, content: user_prompt }],
post_id: post.id,
topic_id: post.topic_id,
)
results = []
llm.generate(
prompt,
user: post.user,
feature_name: context.feature_name,
cancel_manager: context.cancel_manager,
) { |partial| results << partial }
@progress_dots ||= 0
@progress_dots += 1
blk.call(details + "\n\n#{"." * @progress_dots}")
results.join
end
def goal_system_prompt(goals)
<<~TEXT
You are a researcher tool designed to analyze and extract information from forum content on #{Discourse.base_url}.
The current date is #{::Time.zone.now.strftime("%a, %d %b %Y %H:%M %Z")}.
Your task is to process the provided content and extract relevant information based on the specified goal.
When extracting content ALWAYS include the following:
- Multiple citations using Markdown
- Topic citations: Interesting fact [ref](/t/-/TOPIC_ID)
- Post citations: Interesting fact [ref](/t/-/TOPIC_ID/POST_NUMBER)
- Relevent quotes from the direct source content
- Relevant dates and times from the content
Your goal is: #{goals}
TEXT
end
def goal_user_prompt(goals, chunk_text)
<<~TEXT
Here is the content to analyze:
{{{
#{chunk_text}
}}}
Your goal is: #{goals}
TEXT
end
end
end
end
end