FEATURE: add context and llm controls to researcher, fix username filter (#1401)

Adds context length controls to researcher (max tokens per post and batch)
Allow picking LLM for researcher
Fix bug where unicode usernames were not working
Fix documentation of OR logic
This commit is contained in:
Sam 2025-06-04 16:39:43 +10:00 committed by GitHub
parent 4f980d5514
commit 3e74eea1e5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 129 additions and 30 deletions

View File

@ -344,6 +344,15 @@ en:
searching: "Searching for: '%{query}'"
tool_options:
researcher:
researcher_llm:
name: "LLM"
description: "Language model to use for research (defaults to the current persona's LLM)"
max_tokens_per_batch:
name: "Maximum tokens per batch"
description: "Maximum number of tokens to use for each batch in the research"
max_tokens_per_post:
name: "Maximum tokens per post"
description: "Maximum number of tokens to use for each post in the research"
max_results:
name: "Maximum number of results"
description: "Maximum number of results to include in a filter"

View File

@ -31,26 +31,28 @@ module DiscourseAi
# Human-readable grammar for the research filter string. This text is
# handed to the LLM so it can construct valid filters: space-separated
# filters AND together, while "OR" splits the string into independent
# filter groups that are processed separately.
def filter_description
  <<~TEXT
    Filter string to target specific content. Space-separated filters use AND logic, OR creates separate filter groups.
    **Filters:**
    - username:user1 or usernames:user1,user2 - posts by specific users
    - group:group1 or groups:group1,group2 - posts by users in specific groups
    - post_type:first|reply - first posts only or replies only
    - keywords:word1,word2 - full-text search in post content
    - topic_keywords:word1,word2 - full-text search in topics (returns all posts from matching topics)
    - topic:123 or topics:123,456 - specific topics by ID
    - category:name1 or categories:name1,name2 - posts in categories (by name/slug)
    - tag:tag1 or tags:tag1,tag2 - posts in topics with tags
    - after:YYYY-MM-DD, before:YYYY-MM-DD - filter by post creation date
    - topic_after:YYYY-MM-DD, topic_before:YYYY-MM-DD - filter by topic creation date
    - status:open|closed|archived|noreplies|single_user - topic status filters
    - max_results:N - limit results (per OR group)
    - order:latest|oldest|latest_topic|oldest_topic|likes - sort order
    **OR Logic:** Each OR group processes independently - filters don't cross boundaries.
    Examples:
    - 'username:sam after:2023-01-01' - sam's posts after date
    - 'max_results:50 category:bugs OR tag:urgent' - (50 bug posts) OR (all urgent posts)
  TEXT
end
@ -60,9 +62,11 @@ module DiscourseAi
# Per-persona options an admin can configure for this tool.
def accepted_options
  [
    option(:researcher_llm, type: :llm), # override the persona's default LLM for research calls
    option(:max_results, type: :integer),
    option(:include_private, type: :boolean),
    option(:max_tokens_per_post, type: :integer), # truncation limit applied to each individual post
    option(:max_tokens_per_batch, type: :integer), # token budget for each research batch sent to the LLM
  ]
end
end
@ -134,17 +138,32 @@ module DiscourseAi
protected
MIN_TOKENS_FOR_RESEARCH = 8000
MIN_TOKENS_FOR_POST = 50
def process_filter(filter, goals, post, &blk)
if llm.max_prompt_tokens < MIN_TOKENS_FOR_RESEARCH
if researcher_llm.max_prompt_tokens < MIN_TOKENS_FOR_RESEARCH
raise ArgumentError,
"LLM max tokens too low for research. Minimum is #{MIN_TOKENS_FOR_RESEARCH}."
end
max_tokens_per_batch = options[:max_tokens_per_batch].to_i
if max_tokens_per_batch <= MIN_TOKENS_FOR_RESEARCH
max_tokens_per_batch = researcher_llm.max_prompt_tokens - 2000
end
max_tokens_per_post = options[:max_tokens_per_post]
if max_tokens_per_post.nil?
max_tokens_per_post = 2000
elsif max_tokens_per_post < MIN_TOKENS_FOR_POST
max_tokens_per_post = MIN_TOKENS_FOR_POST
end
formatter =
DiscourseAi::Utils::Research::LlmFormatter.new(
filter,
max_tokens_per_batch: llm.max_prompt_tokens - 2000,
tokenizer: llm.tokenizer,
max_tokens_per_post: options[:max_tokens_per_post] || 2000,
max_tokens_per_batch: max_tokens_per_batch,
tokenizer: researcher_llm.tokenizer,
max_tokens_per_post: max_tokens_per_post,
)
results = []
@ -164,6 +183,14 @@ module DiscourseAi
end
end
# LLM used for research calls. When the persona options carry a
# researcher_llm model id, that model is resolved and used; otherwise we
# fall back to the persona's own LLM. Memoized for the tool's lifetime.
def researcher_llm
  @researcher_llm ||=
    begin
      configured_id = options[:researcher_llm]
      override = nil
      override = LlmModel.find_by(id: configured_id.to_i)&.to_llm if configured_id.present?
      override || llm
    end
end
def run_inference(chunk_text, goals, post, &blk)
return if context.cancel_manager&.cancelled?
@ -179,7 +206,7 @@ module DiscourseAi
)
results = []
llm.generate(
researcher_llm.generate(
prompt,
user: post.user,
feature_name: context.feature_name,

View File

@ -153,12 +153,12 @@ module DiscourseAi
end
end
# Filter posts by author. Accepts username:foo or usernames:foo,bar and
# matches case-insensitively via username_lower. Matching on the stored
# lowercase column (rather than a \w-based @mention regex) also works
# for unicode usernames.
register_filter(/\Ausernames?:(.+)\z/i) do |relation, username, filter|
  user_ids = User.where(username_lower: username.split(",").map(&:downcase)).pluck(:id)
  if user_ids.empty?
    # No matching users: return an empty relation rather than all posts.
    relation.where("1 = 0")
  else
    relation.where("posts.user_id IN (?)", user_ids)
  end
end

View File

@ -21,6 +21,54 @@ RSpec.describe DiscourseAi::Personas::Tools::Researcher do
before { SiteSetting.ai_bot_enabled = true }
it "uses custom researcher_llm and applies token limits correctly" do
  # A second LLM model proves the researcher_llm option overrides the persona's LLM.
  secondary_llm_model = Fabricate(:llm_model, name: "secondary_model")

  # Long post content forces the max_tokens_per_post truncation to kick in.
  topic = Fabricate(:topic, category: category, tags: [tag_research])
  long_content = "zz " * 100
  _test_post =
    Fabricate(:post, topic: topic, raw: long_content, user: user, skip_validation: true)

  captured_prompts = nil
  responses = [["Research completed"]]
  researcher = nil

  DiscourseAi::Completions::Llm.with_prepared_responses(
    responses,
    llm: secondary_llm_model,
  ) do |_, _, prompts|
    researcher =
      described_class.new(
        { filter: "category:research-category", goals: "analyze test content", dry_run: false },
        persona_options: {
          "researcher_llm" => secondary_llm_model.id,
          "max_tokens_per_post" => 50, # very small to force truncation
          "max_tokens_per_batch" => 8000,
        },
        bot_user: bot_user,
        llm: nil,
        context: DiscourseAi::Personas::BotContext.new(user: user, post: post),
      )

    results = researcher.invoke(&progress_blk)
    expect(results[:dry_run]).to eq(false)
    expect(results[:results]).to be_present
    captured_prompts = prompts
  end

  expect(captured_prompts).to be_present
  user_message = captured_prompts.first.messages.find { |m| m[:type] == :user }
  expect(user_message[:content]).to be_present
  # Count how many times "zz " appears in the content (a bit of token
  # magic: we lose a couple because of redaction).
  expect(user_message[:content].scan("zz ").count).to eq(48)
end
describe "#invoke" do
it "can correctly filter to a topic id" do
researcher =
@ -104,7 +152,7 @@ RSpec.describe DiscourseAi::Personas::Tools::Researcher do
researcher =
described_class.new(
{
filter: "category:research-category @#{user.username}",
filter: "category:research-category username:#{user.username}",
goals: "find relevant content",
dry_run: false,
},
@ -129,7 +177,7 @@ RSpec.describe DiscourseAi::Personas::Tools::Researcher do
expect(results[:dry_run]).to eq(false)
expect(results[:goals]).to eq("find relevant content")
expect(results[:filter]).to eq("category:research-category @#{user.username}")
expect(results[:filter]).to eq("category:research-category username:#{user.username}")
expect(results[:results].first).to include("Found: Relevant content 1")
end
end

View File

@ -144,6 +144,21 @@ describe DiscourseAi::Utils::Research::Filter do
end
end
# Regression coverage: the username filter must match unicode usernames,
# which a \w-based @username regex could not.
describe "can find posts by users even with unicode usernames" do
  before { SiteSetting.unicode_usernames = true }
  let!(:unicode_user) { Fabricate(:user, username: "aאb") }
  it "can filter by unicode usernames" do
    post = Fabricate(:post, user: unicode_user, topic: feature_topic)
    # Singular form matches a single unicode username exactly.
    filter = described_class.new("username:aאb")
    expect(filter.search.pluck(:id)).to contain_exactly(post.id)
    # Plural form accepts a comma-separated list of usernames.
    filter = described_class.new("usernames:aאb,#{user.username}")
    posts_ids = Post.where(user_id: [unicode_user.id, user.id]).pluck(:id)
    expect(filter.search.pluck(:id)).to contain_exactly(*posts_ids)
  end
end
describe "category filtering" do
it "correctly filters posts by categories" do
filter = described_class.new("category:Announcements")