mirror of
https://github.com/discourse/discourse-ai.git
synced 2025-11-13 04:39:22 +00:00
This commit introduces a new Forum Researcher persona specialized in deep forum content analysis along with comprehensive improvements to our AI infrastructure.
Key additions:
New Forum Researcher persona with advanced filtering and analysis capabilities
Robust filtering system supporting tags, categories, dates, users, and keywords
LLM formatter to efficiently process and chunk research results
Infrastructure improvements:
Implemented CancelManager class to centrally manage AI completion cancellations
Replaced callback-based cancellation with a more robust pattern
Added systematic cancellation monitoring with callbacks
Other improvements:
Added configurable default_enabled flag to control which personas are enabled by default
Updated translation strings for the new researcher functionality
Added comprehensive specs for the new components
Renamed Researcher to Web Researcher
This change makes our AI platform more stable while adding powerful research capabilities that can analyze forum trends and surface relevant content.
75 lines
2.5 KiB
Ruby
75 lines
2.5 KiB
Ruby
# frozen_string_literal: true
|
|
#
|
|
# Specs for LlmFormatter's per-post truncation and post formatting.
# Both methods under test are invoked through `send`
# (presumably because they are private — confirm against the class).
describe DiscourseAi::Utils::Research::LlmFormatter do
  fab!(:user) { Fabricate(:user, username: "test_user") }
  fab!(:topic) { Fabricate(:topic, title: "This is a Test Topic", user: user) }
  fab!(:post) { Fabricate(:post, topic: topic, user: user) }

  let(:tokenizer) { DiscourseAi::Tokenizer::OpenAiTokenizer }
  let(:filter) { DiscourseAi::Utils::Research::Filter.new("@#{user.username}") }

  # "word" repeated 200 times, separated by single spaces, with no
  # trailing whitespace. Used as content that exceeds a 50-token budget.
  let(:two_hundred_words) { Array.new(200, "word").join(" ") }

  # Builds a formatter with a fixed 1000-token batch budget; only the
  # per-post budget varies between examples.
  def build_formatter(max_tokens_per_post:)
    described_class.new(
      filter,
      max_tokens_per_batch: 1000,
      tokenizer: tokenizer,
      max_tokens_per_post: max_tokens_per_post,
    )
  end

  describe "#truncate_if_needed" do
    it "returns original content when under token limit" do
      brief = "This is a short post"
      result = build_formatter(max_tokens_per_post: 100).send(:truncate_if_needed, brief)

      expect(result).to eq(brief)
    end

    it "truncates content when over token limit" do
      shortened = build_formatter(max_tokens_per_post: 50).send(:truncate_if_needed, two_hundred_words)

      expect(shortened).to include("... elided 150 tokens ...")
      expect(shortened).not_to eq(two_hundred_words)

      # The output is expected to be three blank-line-separated sections:
      # leading text, the elision marker, trailing text. Each text section
      # should hold roughly 25 words (half of max_tokens_per_post).
      sections = shortened.split("\n\n")

      head_words = sections.first.split(" ")
      expect(head_words.size).to be_within(5).of(25)

      tail_words = sections[2].split(" ")
      expect(tail_words.size).to be_within(5).of(25)
    end
  end

  describe "#format_post" do
    it "formats posts with truncation for long content" do
      # The post is fabricated before the formatter is built, as in the
      # original ordering of this example.
      wordy_post = Fabricate(:post, raw: two_hundred_words, topic: topic, user: user)
      formatter = build_formatter(max_tokens_per_post: 50)

      output = formatter.send(:format_post, wordy_post)

      # Standard header and URL lines
      expect(output).to include("## Post by #{user.username}")
      expect(output).to include("Post url: /t/-/#{wordy_post.topic_id}/#{wordy_post.post_number}")

      # Truncation marker for the over-budget body
      expect(output).to include("... elided 150 tokens ...")
    end
  end
end
|