mirror of
https://github.com/discourse/discourse-ai.git
synced 2025-07-01 03:52:34 +00:00
This commit introduces a new Forum Researcher persona specialized in deep forum content analysis, along with comprehensive improvements to our AI infrastructure. Key additions: a new Forum Researcher persona with advanced filtering and analysis capabilities; a robust filtering system supporting tags, categories, dates, users, and keywords; an LLM formatter to efficiently process and chunk research results. Infrastructure improvements: implemented a CancelManager class to centrally manage AI completion cancellations; replaced callback-based cancellation with a more robust pattern; added systematic cancellation monitoring with callbacks. Other improvements: added a configurable default_enabled flag to control which personas are enabled by default; updated translation strings for the new researcher functionality; added comprehensive specs for the new components; renamed Researcher -> Web Researcher. This change makes our AI platform more stable while adding powerful research capabilities that can analyze forum trends and surface relevant content.
75 lines
2.5 KiB
Ruby
# frozen_string_literal: true
describe DiscourseAi::Utils::Research::LlmFormatter do
|
|
fab!(:user) { Fabricate(:user, username: "test_user") }
|
|
fab!(:topic) { Fabricate(:topic, title: "This is a Test Topic", user: user) }
|
|
fab!(:post) { Fabricate(:post, topic: topic, user: user) }
|
|
let(:tokenizer) { DiscourseAi::Tokenizer::OpenAiTokenizer }
|
|
let(:filter) { DiscourseAi::Utils::Research::Filter.new("@#{user.username}") }
|
|
|
|
describe "#truncate_if_needed" do
|
|
it "returns original content when under token limit" do
|
|
formatter =
|
|
described_class.new(
|
|
filter,
|
|
max_tokens_per_batch: 1000,
|
|
tokenizer: tokenizer,
|
|
max_tokens_per_post: 100,
|
|
)
|
|
|
|
short_text = "This is a short post"
|
|
expect(formatter.send(:truncate_if_needed, short_text)).to eq(short_text)
|
|
end
|
|
|
|
it "truncates content when over token limit" do
|
|
# Create a post with content that will exceed our token limit
|
|
long_text = ("word " * 200).strip
|
|
|
|
formatter =
|
|
described_class.new(
|
|
filter,
|
|
max_tokens_per_batch: 1000,
|
|
tokenizer: tokenizer,
|
|
max_tokens_per_post: 50,
|
|
)
|
|
|
|
truncated = formatter.send(:truncate_if_needed, long_text)
|
|
|
|
expect(truncated).to include("... elided 150 tokens ...")
|
|
expect(truncated).to_not eq(long_text)
|
|
|
|
# Should have roughly 25 words before and 25 after (half of max_tokens_per_post)
|
|
first_chunk = truncated.split("\n\n")[0]
|
|
expect(first_chunk.split(" ").length).to be_within(5).of(25)
|
|
|
|
last_chunk = truncated.split("\n\n")[2]
|
|
expect(last_chunk.split(" ").length).to be_within(5).of(25)
|
|
end
|
|
end
|
|
|
|
describe "#format_post" do
|
|
it "formats posts with truncation for long content" do
|
|
# Set up a post with long content
|
|
long_content = ("word " * 200).strip
|
|
long_post = Fabricate(:post, raw: long_content, topic: topic, user: user)
|
|
|
|
formatter =
|
|
described_class.new(
|
|
filter,
|
|
max_tokens_per_batch: 1000,
|
|
tokenizer: tokenizer,
|
|
max_tokens_per_post: 50,
|
|
)
|
|
|
|
formatted = formatter.send(:format_post, long_post)
|
|
|
|
# Should have standard formatting elements
|
|
expect(formatted).to include("## Post by #{user.username}")
|
|
expect(formatted).to include("Post url: /t/-/#{long_post.topic_id}/#{long_post.post_number}")
|
|
|
|
# Should include truncation marker
|
|
expect(formatted).to include("... elided 150 tokens ...")
|
|
end
|
|
end
|
|
end
|