mirror of
https://github.com/discourse/discourse-ai.git
synced 2025-07-01 03:52:34 +00:00
This commit introduces a new Forum Researcher persona specialized in deep forum content analysis, along with comprehensive improvements to our AI infrastructure. Key additions: a new Forum Researcher persona with advanced filtering and analysis capabilities; a robust filtering system supporting tags, categories, dates, users, and keywords; an LLM formatter to efficiently process and chunk research results. Infrastructure improvements: implemented a CancelManager class to centrally manage AI completion cancellations; replaced callback-based cancellation with a more robust pattern; added systematic cancellation monitoring with callbacks. Other improvements: added a configurable default_enabled flag to control which personas are enabled by default; updated translation strings for the new researcher functionality; added comprehensive specs for the new components; renamed Researcher -> Web Researcher. This change makes our AI platform more stable while adding powerful research capabilities that can analyze forum trends and surface relevant content.
75 lines
2.5 KiB
Ruby
# frozen_string_literal: true
describe DiscourseAi::Utils::Research::LlmFormatter do
|
|
fab!(:user) { Fabricate(:user, username: "test_user") }
|
|
fab!(:topic) { Fabricate(:topic, title: "This is a Test Topic", user: user) }
|
|
fab!(:post) { Fabricate(:post, topic: topic, user: user) }
|
|
let(:tokenizer) { DiscourseAi::Tokenizer::OpenAiTokenizer }
|
|
let(:filter) { DiscourseAi::Utils::Research::Filter.new("@#{user.username}") }
|
|
|
|
describe "#truncate_if_needed" do
|
|
it "returns original content when under token limit" do
|
|
formatter =
|
|
described_class.new(
|
|
filter,
|
|
max_tokens_per_batch: 1000,
|
|
tokenizer: tokenizer,
|
|
max_tokens_per_post: 100,
|
|
)
|
|
|
|
short_text = "This is a short post"
|
|
expect(formatter.send(:truncate_if_needed, short_text)).to eq(short_text)
|
|
end
|
|
|
|
it "truncates content when over token limit" do
|
|
# Create a post with content that will exceed our token limit
|
|
long_text = ("word " * 200).strip
|
|
|
|
formatter =
|
|
described_class.new(
|
|
filter,
|
|
max_tokens_per_batch: 1000,
|
|
tokenizer: tokenizer,
|
|
max_tokens_per_post: 50,
|
|
)
|
|
|
|
truncated = formatter.send(:truncate_if_needed, long_text)
|
|
|
|
expect(truncated).to include("... elided 150 tokens ...")
|
|
expect(truncated).to_not eq(long_text)
|
|
|
|
# Should have roughly 25 words before and 25 after (half of max_tokens_per_post)
|
|
first_chunk = truncated.split("\n\n")[0]
|
|
expect(first_chunk.split(" ").length).to be_within(5).of(25)
|
|
|
|
last_chunk = truncated.split("\n\n")[2]
|
|
expect(last_chunk.split(" ").length).to be_within(5).of(25)
|
|
end
|
|
end
|
|
|
|
describe "#format_post" do
|
|
it "formats posts with truncation for long content" do
|
|
# Set up a post with long content
|
|
long_content = ("word " * 200).strip
|
|
long_post = Fabricate(:post, raw: long_content, topic: topic, user: user)
|
|
|
|
formatter =
|
|
described_class.new(
|
|
filter,
|
|
max_tokens_per_batch: 1000,
|
|
tokenizer: tokenizer,
|
|
max_tokens_per_post: 50,
|
|
)
|
|
|
|
formatted = formatter.send(:format_post, long_post)
|
|
|
|
# Should have standard formatting elements
|
|
expect(formatted).to include("## Post by #{user.username}")
|
|
expect(formatted).to include("Post url: /t/-/#{long_post.topic_id}/#{long_post.post_number}")
|
|
|
|
# Should include truncation marker
|
|
expect(formatted).to include("... elided 150 tokens ...")
|
|
end
|
|
end
|
|
end
|