mirror of
https://github.com/discourse/discourse-ai.git
synced 2025-07-14 01:53:27 +00:00
FEATURE: add context and llm controls to researcher, fix username filter (#1401)
Adds context length controls to researcher (max tokens per post and batch). Allows picking the LLM for researcher. Fixes a bug where unicode usernames were not working. Fixes documentation of OR logic.
This commit is contained in:
parent
4f980d5514
commit
3e74eea1e5
@ -344,6 +344,15 @@ en:
|
||||
searching: "Searching for: '%{query}'"
|
||||
tool_options:
|
||||
researcher:
|
||||
researcher_llm:
|
||||
name: "LLM"
|
||||
description: "Language model to use for research (default to current persona's LLM)"
|
||||
max_tokens_per_batch:
|
||||
name: "Maximum tokens per batch"
|
||||
description: "Maximum number of tokens to use for each batch in the research"
|
||||
max_tokens_per_post:
|
||||
name: "Maximum tokens per post"
|
||||
description: "Maximum number of tokens to use for each post in the research"
|
||||
max_results:
|
||||
name: "Maximum number of results"
|
||||
description: "Maximum number of results to include in a filter"
|
||||
|
@ -31,26 +31,28 @@ module DiscourseAi
|
||||
|
||||
def filter_description
|
||||
<<~TEXT
|
||||
Filter string to target specific content.
|
||||
- Supports user (@username)
|
||||
- post_type:first - only includes first posts in topics
|
||||
- post_type:reply - only replies in topics
|
||||
- date ranges (after:YYYY-MM-DD, before:YYYY-MM-DD for posts; topic_after:YYYY-MM-DD, topic_before:YYYY-MM-DD for topics)
|
||||
- categories (category:category1,category2 or categories:category1,category2)
|
||||
- tags (tag:tag1,tag2 or tags:tag1,tag2)
|
||||
- groups (group:group1,group2 or groups:group1,group2)
|
||||
- status (status:open, status:closed, status:archived, status:noreplies, status:single_user)
|
||||
- keywords (keywords:keyword1,keyword2) - searches for specific words within post content using full-text search
|
||||
- topic_keywords (topic_keywords:keyword1,keyword2) - searches for keywords within topics, returns all posts from matching topics
|
||||
- topics (topic:topic_id1,topic_id2 or topics:topic_id1,topic_id2) - target specific topics by ID
|
||||
- max_results (max_results:10) - limits the maximum number of results returned (optional)
|
||||
- order (order:latest, order:oldest, order:latest_topic, order:oldest_topic, order:likes) - controls result ordering (optional, defaults to latest posts)
|
||||
Filter string to target specific content. Space-separated filters use AND logic, OR creates separate filter groups.
|
||||
|
||||
Multiple filters can be combined with spaces for AND logic. Example: '@sam after:2023-01-01 tag:feature'
|
||||
**Filters:**
|
||||
- username:user1 or usernames:user1,user2 - posts by specific users
|
||||
- group:group1 or groups:group1,group2 - posts by users in specific groups
|
||||
- post_type:first|reply - first posts only or replies only
|
||||
- keywords:word1,word2 - full-text search in post content
|
||||
- topic_keywords:word1,word2 - full-text search in topics (returns all posts from matching topics)
|
||||
- topic:123 or topics:123,456 - specific topics by ID
|
||||
- category:name1 or categories:name1,name2 - posts in categories (by name/slug)
|
||||
- tag:tag1 or tags:tag1,tag2 - posts in topics with tags
|
||||
- after:YYYY-MM-DD, before:YYYY-MM-DD - filter by post creation date
|
||||
- topic_after:YYYY-MM-DD, topic_before:YYYY-MM-DD - filter by topic creation date
|
||||
- status:open|closed|archived|noreplies|single_user - topic status filters
|
||||
- max_results:N - limit results (per OR group)
|
||||
- order:latest|oldest|latest_topic|oldest_topic|likes - sort order
|
||||
|
||||
Use OR to combine filter segments for inclusive logic.
|
||||
Example: 'category:feature,bug OR tag:feature-tag' - includes posts in feature OR bug categories, OR posts with feature-tag tag
|
||||
Example: '@sam category:bug' - includes posts by @sam AND in bug category
|
||||
**OR Logic:** Each OR group processes independently - filters don't cross boundaries.
|
||||
|
||||
Examples:
|
||||
- 'username:sam after:2023-01-01' - sam's posts after date
|
||||
- 'max_results:50 category:bugs OR tag:urgent' - (≤50 bug posts) OR (all urgent posts)
|
||||
TEXT
|
||||
end
|
||||
|
||||
@ -60,9 +62,11 @@ module DiscourseAi
|
||||
|
||||
def accepted_options
|
||||
[
|
||||
option(:researcher_llm, type: :llm),
|
||||
option(:max_results, type: :integer),
|
||||
option(:include_private, type: :boolean),
|
||||
option(:max_tokens_per_post, type: :integer),
|
||||
option(:max_tokens_per_batch, type: :integer),
|
||||
]
|
||||
end
|
||||
end
|
||||
@ -134,17 +138,32 @@ module DiscourseAi
|
||||
protected
|
||||
|
||||
MIN_TOKENS_FOR_RESEARCH = 8000
|
||||
MIN_TOKENS_FOR_POST = 50
|
||||
|
||||
def process_filter(filter, goals, post, &blk)
|
||||
if llm.max_prompt_tokens < MIN_TOKENS_FOR_RESEARCH
|
||||
if researcher_llm.max_prompt_tokens < MIN_TOKENS_FOR_RESEARCH
|
||||
raise ArgumentError,
|
||||
"LLM max tokens too low for research. Minimum is #{MIN_TOKENS_FOR_RESEARCH}."
|
||||
end
|
||||
|
||||
max_tokens_per_batch = options[:max_tokens_per_batch].to_i
|
||||
if max_tokens_per_batch <= MIN_TOKENS_FOR_RESEARCH
|
||||
max_tokens_per_batch = researcher_llm.max_prompt_tokens - 2000
|
||||
end
|
||||
|
||||
max_tokens_per_post = options[:max_tokens_per_post]
|
||||
if max_tokens_per_post.nil?
|
||||
max_tokens_per_post = 2000
|
||||
elsif max_tokens_per_post < MIN_TOKENS_FOR_POST
|
||||
max_tokens_per_post = MIN_TOKENS_FOR_POST
|
||||
end
|
||||
|
||||
formatter =
|
||||
DiscourseAi::Utils::Research::LlmFormatter.new(
|
||||
filter,
|
||||
max_tokens_per_batch: llm.max_prompt_tokens - 2000,
|
||||
tokenizer: llm.tokenizer,
|
||||
max_tokens_per_post: options[:max_tokens_per_post] || 2000,
|
||||
max_tokens_per_batch: max_tokens_per_batch,
|
||||
tokenizer: researcher_llm.tokenizer,
|
||||
max_tokens_per_post: max_tokens_per_post,
|
||||
)
|
||||
|
||||
results = []
|
||||
@ -164,6 +183,14 @@ module DiscourseAi
|
||||
end
|
||||
end
|
||||
|
||||
def researcher_llm
|
||||
@researcher_llm ||=
|
||||
(
|
||||
options[:researcher_llm].present? &&
|
||||
LlmModel.find_by(id: options[:researcher_llm].to_i)&.to_llm
|
||||
) || self.llm
|
||||
end
|
||||
|
||||
def run_inference(chunk_text, goals, post, &blk)
|
||||
return if context.cancel_manager&.cancelled?
|
||||
|
||||
@ -179,7 +206,7 @@ module DiscourseAi
|
||||
)
|
||||
|
||||
results = []
|
||||
llm.generate(
|
||||
researcher_llm.generate(
|
||||
prompt,
|
||||
user: post.user,
|
||||
feature_name: context.feature_name,
|
||||
|
@ -153,12 +153,12 @@ module DiscourseAi
|
||||
end
|
||||
end
|
||||
|
||||
register_filter(/\A\@(\w+)\z/i) do |relation, username, filter|
|
||||
user = User.find_by(username_lower: username.downcase)
|
||||
if user
|
||||
relation.where("posts.user_id = ?", user.id)
|
||||
register_filter(/\Ausernames?:(.+)\z/i) do |relation, username, filter|
|
||||
user_ids = User.where(username_lower: username.split(",").map(&:downcase)).pluck(:id)
|
||||
if user_ids.empty?
|
||||
relation.where("1 = 0")
|
||||
else
|
||||
relation.where("1 = 0") # No results if user doesn't exist
|
||||
relation.where("posts.user_id IN (?)", user_ids)
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -21,6 +21,54 @@ RSpec.describe DiscourseAi::Personas::Tools::Researcher do
|
||||
|
||||
before { SiteSetting.ai_bot_enabled = true }
|
||||
|
||||
it "uses custom researcher_llm and applies token limits correctly" do
|
||||
# Create a second LLM model to test the researcher_llm option
|
||||
secondary_llm_model = Fabricate(:llm_model, name: "secondary_model")
|
||||
|
||||
# Create test content with long text to test token truncation
|
||||
topic = Fabricate(:topic, category: category, tags: [tag_research])
|
||||
long_content = "zz " * 100 # This will exceed our token limit
|
||||
_test_post =
|
||||
Fabricate(:post, topic: topic, raw: long_content, user: user, skip_validation: true)
|
||||
|
||||
prompts = nil
|
||||
responses = [["Research completed"]]
|
||||
researcher = nil
|
||||
|
||||
DiscourseAi::Completions::Llm.with_prepared_responses(
|
||||
responses,
|
||||
llm: secondary_llm_model,
|
||||
) do |_, _, _prompts|
|
||||
researcher =
|
||||
described_class.new(
|
||||
{ filter: "category:research-category", goals: "analyze test content", dry_run: false },
|
||||
persona_options: {
|
||||
"researcher_llm" => secondary_llm_model.id,
|
||||
"max_tokens_per_post" => 50, # Very small to force truncation
|
||||
"max_tokens_per_batch" => 8000,
|
||||
},
|
||||
bot_user: bot_user,
|
||||
llm: nil,
|
||||
context: DiscourseAi::Personas::BotContext.new(user: user, post: post),
|
||||
)
|
||||
|
||||
results = researcher.invoke(&progress_blk)
|
||||
|
||||
expect(results[:dry_run]).to eq(false)
|
||||
expect(results[:results]).to be_present
|
||||
|
||||
prompts = _prompts
|
||||
end
|
||||
|
||||
expect(prompts).to be_present
|
||||
|
||||
user_message = prompts.first.messages.find { |m| m[:type] == :user }
|
||||
expect(user_message[:content]).to be_present
|
||||
|
||||
# count how many times the "zz " appears in the content (a bit of token magic, we lose a couple cause we redact)
|
||||
expect(user_message[:content].scan("zz ").count).to eq(48)
|
||||
end
|
||||
|
||||
describe "#invoke" do
|
||||
it "can correctly filter to a topic id" do
|
||||
researcher =
|
||||
@ -104,7 +152,7 @@ RSpec.describe DiscourseAi::Personas::Tools::Researcher do
|
||||
researcher =
|
||||
described_class.new(
|
||||
{
|
||||
filter: "category:research-category @#{user.username}",
|
||||
filter: "category:research-category username:#{user.username}",
|
||||
goals: "find relevant content",
|
||||
dry_run: false,
|
||||
},
|
||||
@ -129,7 +177,7 @@ RSpec.describe DiscourseAi::Personas::Tools::Researcher do
|
||||
|
||||
expect(results[:dry_run]).to eq(false)
|
||||
expect(results[:goals]).to eq("find relevant content")
|
||||
expect(results[:filter]).to eq("category:research-category @#{user.username}")
|
||||
expect(results[:filter]).to eq("category:research-category username:#{user.username}")
|
||||
expect(results[:results].first).to include("Found: Relevant content 1")
|
||||
end
|
||||
end
|
||||
|
@ -144,6 +144,21 @@ describe DiscourseAi::Utils::Research::Filter do
|
||||
end
|
||||
end
|
||||
|
||||
describe "can find posts by users even with unicode usernames" do
|
||||
before { SiteSetting.unicode_usernames = true }
|
||||
let!(:unicode_user) { Fabricate(:user, username: "aאb") }
|
||||
|
||||
it "can filter by unicode usernames" do
|
||||
post = Fabricate(:post, user: unicode_user, topic: feature_topic)
|
||||
filter = described_class.new("username:aאb")
|
||||
expect(filter.search.pluck(:id)).to contain_exactly(post.id)
|
||||
|
||||
filter = described_class.new("usernames:aאb,#{user.username}")
|
||||
posts_ids = Post.where(user_id: [unicode_user.id, user.id]).pluck(:id)
|
||||
expect(filter.search.pluck(:id)).to contain_exactly(*posts_ids)
|
||||
end
|
||||
end
|
||||
|
||||
describe "category filtering" do
|
||||
it "correctly filters posts by categories" do
|
||||
filter = described_class.new("category:Announcements")
|
||||
|
Loading…
x
Reference in New Issue
Block a user