FEATURE: add context and llm controls to researcher, fix username filter (#1401)

Adds context length controls to researcher (max tokens per post and batch)
Allow picking LLM for researcher
Fix bug where unicode usernames were not working
Fix documentation of OR logic
This commit is contained in:
Sam 2025-06-04 16:39:43 +10:00 committed by GitHub
parent 4f980d5514
commit 3e74eea1e5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 129 additions and 30 deletions

View File

@ -344,6 +344,15 @@ en:
searching: "Searching for: '%{query}'"
tool_options:
researcher:
researcher_llm:
name: "LLM"
description: "Language model to use for research (defaults to the current persona's LLM)"
max_tokens_per_batch:
name: "Maximum tokens per batch"
description: "Maximum number of tokens to use for each batch in the research"
max_tokens_per_post:
name: "Maximum tokens per post"
description: "Maximum number of tokens to use for each post in the research"
max_results:
name: "Maximum number of results"
description: "Maximum number of results to include in a filter"

View File

@ -31,26 +31,28 @@ module DiscourseAi
# Human-readable grammar for the research filter string. This text is
# handed to the LLM so it can construct valid filters: space-separated
# filters AND together, while "OR" splits the string into independent
# filter groups that are processed separately.
def filter_description
  <<~TEXT
    Filter string to target specific content. Space-separated filters use AND logic, OR creates separate filter groups.
    **Filters:**
    - username:user1 or usernames:user1,user2 - posts by specific users
    - group:group1 or groups:group1,group2 - posts by users in specific groups
    - post_type:first|reply - first posts only or replies only
    - keywords:word1,word2 - full-text search in post content
    - topic_keywords:word1,word2 - full-text search in topics (returns all posts from matching topics)
    - topic:123 or topics:123,456 - specific topics by ID
    - category:name1 or categories:name1,name2 - posts in categories (by name/slug)
    - tag:tag1 or tags:tag1,tag2 - posts in topics with tags
    - after:YYYY-MM-DD, before:YYYY-MM-DD - filter by post creation date
    - topic_after:YYYY-MM-DD, topic_before:YYYY-MM-DD - filter by topic creation date
    - status:open|closed|archived|noreplies|single_user - topic status filters
    - max_results:N - limit results (per OR group)
    - order:latest|oldest|latest_topic|oldest_topic|likes - sort order
    **OR Logic:** Each OR group processes independently - filters don't cross boundaries.
    Examples:
    - 'username:sam after:2023-01-01' - sam's posts after date
    - 'max_results:50 category:bugs OR tag:urgent' - (50 bug posts) OR (all urgent posts)
  TEXT
end
@ -60,9 +62,11 @@ module DiscourseAi
# Per-persona options an admin can configure for this tool.
def accepted_options
  [
    option(:researcher_llm, type: :llm), # override the persona's default LLM for research calls
    option(:max_results, type: :integer),
    option(:include_private, type: :boolean),
    option(:max_tokens_per_post, type: :integer), # truncation limit applied to each individual post
    option(:max_tokens_per_batch, type: :integer), # token budget for each research batch sent to the LLM
  ]
end
end
@ -134,17 +138,32 @@ module DiscourseAi
protected
MIN_TOKENS_FOR_RESEARCH = 8000
MIN_TOKENS_FOR_POST = 50
def process_filter(filter, goals, post, &blk)
if llm.max_prompt_tokens < MIN_TOKENS_FOR_RESEARCH
if researcher_llm.max_prompt_tokens < MIN_TOKENS_FOR_RESEARCH
raise ArgumentError,
"LLM max tokens too low for research. Minimum is #{MIN_TOKENS_FOR_RESEARCH}."
end
max_tokens_per_batch = options[:max_tokens_per_batch].to_i
if max_tokens_per_batch <= MIN_TOKENS_FOR_RESEARCH
max_tokens_per_batch = researcher_llm.max_prompt_tokens - 2000
end
max_tokens_per_post = options[:max_tokens_per_post]
if max_tokens_per_post.nil?
max_tokens_per_post = 2000
elsif max_tokens_per_post < MIN_TOKENS_FOR_POST
max_tokens_per_post = MIN_TOKENS_FOR_POST
end
formatter =
DiscourseAi::Utils::Research::LlmFormatter.new(
filter,
max_tokens_per_batch: llm.max_prompt_tokens - 2000,
tokenizer: llm.tokenizer,
max_tokens_per_post: options[:max_tokens_per_post] || 2000,
max_tokens_per_batch: max_tokens_per_batch,
tokenizer: researcher_llm.tokenizer,
max_tokens_per_post: max_tokens_per_post,
)
results = []
@ -164,6 +183,14 @@ module DiscourseAi
end
end
# LLM used for research calls. When the persona options carry a
# researcher_llm model id, that model is resolved and used; otherwise we
# fall back to the persona's own LLM. Memoized for the tool's lifetime.
def researcher_llm
  @researcher_llm ||=
    begin
      configured_id = options[:researcher_llm]
      override = nil
      override = LlmModel.find_by(id: configured_id.to_i)&.to_llm if configured_id.present?
      override || llm
    end
end
def run_inference(chunk_text, goals, post, &blk)
return if context.cancel_manager&.cancelled?
@ -179,7 +206,7 @@ module DiscourseAi
)
results = []
llm.generate(
researcher_llm.generate(
prompt,
user: post.user,
feature_name: context.feature_name,

View File

@ -153,12 +153,12 @@ module DiscourseAi
end
end
# Filter posts by author. Accepts username:foo or usernames:foo,bar and
# matches case-insensitively via username_lower. Matching on the stored
# lowercase column (rather than a \w-based @mention regex) also works
# for unicode usernames.
register_filter(/\Ausernames?:(.+)\z/i) do |relation, username, filter|
  user_ids = User.where(username_lower: username.split(",").map(&:downcase)).pluck(:id)
  if user_ids.empty?
    # No matching users: return an empty relation rather than all posts.
    relation.where("1 = 0")
  else
    relation.where("posts.user_id IN (?)", user_ids)
  end
end

View File

@ -21,6 +21,54 @@ RSpec.describe DiscourseAi::Personas::Tools::Researcher do
before { SiteSetting.ai_bot_enabled = true }
it "uses custom researcher_llm and applies token limits correctly" do
  # A second LLM model proves the researcher_llm option overrides the persona's LLM.
  secondary_llm_model = Fabricate(:llm_model, name: "secondary_model")

  # Long post content forces the max_tokens_per_post truncation to kick in.
  topic = Fabricate(:topic, category: category, tags: [tag_research])
  long_content = "zz " * 100
  _test_post =
    Fabricate(:post, topic: topic, raw: long_content, user: user, skip_validation: true)

  captured_prompts = nil
  responses = [["Research completed"]]
  researcher = nil

  DiscourseAi::Completions::Llm.with_prepared_responses(
    responses,
    llm: secondary_llm_model,
  ) do |_, _, prompts|
    researcher =
      described_class.new(
        { filter: "category:research-category", goals: "analyze test content", dry_run: false },
        persona_options: {
          "researcher_llm" => secondary_llm_model.id,
          "max_tokens_per_post" => 50, # very small to force truncation
          "max_tokens_per_batch" => 8000,
        },
        bot_user: bot_user,
        llm: nil,
        context: DiscourseAi::Personas::BotContext.new(user: user, post: post),
      )

    results = researcher.invoke(&progress_blk)
    expect(results[:dry_run]).to eq(false)
    expect(results[:results]).to be_present
    captured_prompts = prompts
  end

  expect(captured_prompts).to be_present
  user_message = captured_prompts.first.messages.find { |m| m[:type] == :user }
  expect(user_message[:content]).to be_present
  # Count how many times "zz " appears in the content (a bit of token
  # magic: we lose a couple because of redaction).
  expect(user_message[:content].scan("zz ").count).to eq(48)
end
describe "#invoke" do
it "can correctly filter to a topic id" do
researcher =
@ -104,7 +152,7 @@ RSpec.describe DiscourseAi::Personas::Tools::Researcher do
researcher =
described_class.new(
{
filter: "category:research-category @#{user.username}",
filter: "category:research-category username:#{user.username}",
goals: "find relevant content",
dry_run: false,
},
@ -129,7 +177,7 @@ RSpec.describe DiscourseAi::Personas::Tools::Researcher do
expect(results[:dry_run]).to eq(false)
expect(results[:goals]).to eq("find relevant content")
expect(results[:filter]).to eq("category:research-category @#{user.username}")
expect(results[:filter]).to eq("category:research-category username:#{user.username}")
expect(results[:results].first).to include("Found: Relevant content 1")
end
end

View File

@ -144,6 +144,21 @@ describe DiscourseAi::Utils::Research::Filter do
end
end
# Regression coverage: the username filter must match unicode usernames,
# which a \w-based @username regex could not.
describe "can find posts by users even with unicode usernames" do
  before { SiteSetting.unicode_usernames = true }
  let!(:unicode_user) { Fabricate(:user, username: "aאb") }
  it "can filter by unicode usernames" do
    post = Fabricate(:post, user: unicode_user, topic: feature_topic)
    # Singular form matches a single unicode username exactly.
    filter = described_class.new("username:aאb")
    expect(filter.search.pluck(:id)).to contain_exactly(post.id)
    # Plural form accepts a comma-separated list of usernames.
    filter = described_class.new("usernames:aאb,#{user.username}")
    posts_ids = Post.where(user_id: [unicode_user.id, user.id]).pluck(:id)
    expect(filter.search.pluck(:id)).to contain_exactly(*posts_ids)
  end
end
describe "category filtering" do
it "correctly filters posts by categories" do
filter = described_class.new("category:Announcements")