From 683bb5725bfff386317912e042e3582aa22473d2 Mon Sep 17 00:00:00 2001
From: Natalie Tay <natalie.tay@gmail.com>
Date: Mon, 23 Jun 2025 21:11:20 +0800
Subject: [PATCH] DEV: Split content based on llmmodel's max_output_tokens
 (#1456)

In discourse/discourse-translator#249 we introduced splitting content (post.raw) prior to sending to translation as we were using a sync api.

Now that we're streaming thanks to #1424, we'll chunk based on the LlmModel.max_output_tokens.
---
 app/jobs/regular/detect_translate_post.rb     |  2 +-
 app/jobs/regular/detect_translate_topic.rb    |  2 +-
 app/jobs/regular/localize_categories.rb       |  2 +-
 app/jobs/regular/localize_posts.rb            |  2 +-
 app/jobs/regular/localize_topics.rb           |  2 +-
 lib/translation/base_translator.rb            | 36 ++++----
 lib/translation/content_splitter.rb           | 32 ++++---
 lib/translation/post_localizer.rb             |  6 +-
 spec/lib/translation/base_translator_spec.rb  |  4 +-
 spec/lib/translation/content_splitter_spec.rb | 87 +++++++++----------
 10 files changed, 86 insertions(+), 89 deletions(-)

diff --git a/app/jobs/regular/detect_translate_post.rb b/app/jobs/regular/detect_translate_post.rb
index ea55dd54..ec2381d2 100644
--- a/app/jobs/regular/detect_translate_post.rb
+++ b/app/jobs/regular/detect_translate_post.rb
@@ -49,7 +49,7 @@ module Jobs
           # do nothing, there are too many sporadic lookup failures
         rescue => e
           DiscourseAi::Translation::VerboseLogger.log(
-            "Failed to translate post #{post.id} to #{locale}: #{e.message}",
+            "Failed to translate post #{post.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}",
           )
         end
       end
diff --git a/app/jobs/regular/detect_translate_topic.rb b/app/jobs/regular/detect_translate_topic.rb
index 8a683605..287d5ed1 100644
--- a/app/jobs/regular/detect_translate_topic.rb
+++ b/app/jobs/regular/detect_translate_topic.rb
@@ -47,7 +47,7 @@ module Jobs
           # do nothing, there are too many sporadic lookup failures
         rescue => e
           DiscourseAi::Translation::VerboseLogger.log(
-            "Failed to translate topic #{topic.id} to #{locale}: #{e.message}",
+            "Failed to translate topic #{topic.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}",
           )
         end
       end
diff --git a/app/jobs/regular/localize_categories.rb b/app/jobs/regular/localize_categories.rb
index 9ede9f21..987cd254 100644
--- a/app/jobs/regular/localize_categories.rb
+++ b/app/jobs/regular/localize_categories.rb
@@ -40,7 +40,7 @@ module Jobs
             # do nothing, there are too many sporadic lookup failures
           rescue => e
             DiscourseAi::Translation::VerboseLogger.log(
-              "Failed to translate category #{category.id} to #{locale}: #{e.message}",
+              "Failed to translate category #{category.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}",
             )
           ensure
             remaining_limit -= 1
diff --git a/app/jobs/regular/localize_posts.rb b/app/jobs/regular/localize_posts.rb
index 3dad28e0..50cf4f6c 100644
--- a/app/jobs/regular/localize_posts.rb
+++ b/app/jobs/regular/localize_posts.rb
@@ -65,7 +65,7 @@ module Jobs
             # do nothing, there are too many sporadic lookup failures
           rescue => e
             DiscourseAi::Translation::VerboseLogger.log(
-              "Failed to translate post #{post.id} to #{locale}: #{e.message}",
+              "Failed to translate post #{post.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}",
             )
           end
         end
diff --git a/app/jobs/regular/localize_topics.rb b/app/jobs/regular/localize_topics.rb
index dceedf98..7428eb1f 100644
--- a/app/jobs/regular/localize_topics.rb
+++ b/app/jobs/regular/localize_topics.rb
@@ -62,7 +62,7 @@ module Jobs
             # do nothing, there are too many sporadic lookup failures
           rescue => e
             DiscourseAi::Translation::VerboseLogger.log(
-              "Failed to translate topic #{topic.id} to #{locale}: #{e.message}",
+              "Failed to translate topic #{topic.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}",
             )
           end
         end
diff --git a/lib/translation/base_translator.rb b/lib/translation/base_translator.rb
index c275b843..32fe8db3 100644
--- a/lib/translation/base_translator.rb
+++ b/lib/translation/base_translator.rb
@@ -15,26 +15,34 @@ module DiscourseAi
         if (ai_persona = AiPersona.find_by(id: persona_setting)).blank?
           return nil
         end
-
+        translation_user = ai_persona.user || Discourse.system_user
         persona_klass = ai_persona.class_instance
         persona = persona_klass.new
 
-        llm_model = LlmModel.find_by(id: preferred_llm_model(persona_klass))
-        return nil if llm_model.blank?
+        model = LlmModel.find_by(id: preferred_llm_model(persona_klass))
+        return nil if model.blank?
 
-        bot =
-          DiscourseAi::Personas::Bot.as(
-            ai_persona.user || Discourse.system_user,
-            persona: persona,
-            model: llm_model,
-          )
+        bot = DiscourseAi::Personas::Bot.as(translation_user, persona:, model:)
 
+        ContentSplitter
+          .split(content: @text, chunk_size: model.max_output_tokens)
+          .map { |text| get_translation(text:, bot:, translation_user:) }
+          .join("")
+      end
+
+      private
+
+      def formatted_content(content)
+        { content:, target_locale: @target_locale }.to_json
+      end
+
+      def get_translation(text:, bot:, translation_user:)
         context =
           DiscourseAi::Personas::BotContext.new(
-            user: ai_persona.user || Discourse.system_user,
+            user: translation_user,
             skip_tool_details: true,
             feature_name: "translation",
-            messages: [{ type: :user, content: formatted_content }],
+            messages: [{ type: :user, content: formatted_content(text) }],
             topic: @topic,
             post: @post,
           )
@@ -47,12 +55,6 @@ module DiscourseAi
         structured_output&.read_buffered_property(:translation)
       end
 
-      def formatted_content
-        { content: @text, target_locale: @target_locale }.to_json
-      end
-
-      private
-
       def persona_setting
         raise NotImplementedError
       end
diff --git a/lib/translation/content_splitter.rb b/lib/translation/content_splitter.rb
index 4ce1cc97..1885317e 100644
--- a/lib/translation/content_splitter.rb
+++ b/lib/translation/content_splitter.rb
@@ -3,7 +3,7 @@
 module DiscourseAi
   module Translation
     class ContentSplitter
-      CHUNK_SIZE = 3000
+      DEFAULT_CHUNK_SIZE = 8192
 
       BBCODE_PATTERNS = [
         %r{\[table.*?\].*?\[/table\]}m,
@@ -23,16 +23,17 @@ module DiscourseAi
         /\s+/, # any whitespace
       ].freeze
 
-      def self.split(content)
+      def self.split(content:, chunk_size: DEFAULT_CHUNK_SIZE)
         return [] if content.nil?
         return [""] if content.empty?
-        return [content] if content.length <= CHUNK_SIZE
+        chunk_size ||= DEFAULT_CHUNK_SIZE
+        return [content] if content.length <= chunk_size
 
         chunks = []
         remaining = content.dup
 
         while remaining.present?
-          chunk = extract_mixed_chunk(remaining)
+          chunk = extract_mixed_chunk(remaining, size: chunk_size)
           break if chunk.empty?
           chunks << chunk
           remaining = remaining[chunk.length..-1]
@@ -43,9 +44,8 @@ module DiscourseAi
 
       private
 
-      def self.extract_mixed_chunk(text, size: CHUNK_SIZE)
+      def self.extract_mixed_chunk(text, size:)
         return text if text.length <= size
-        flexible_size = size * 1.5
 
         # try each splitting strategy in order
         split_point =
@@ -54,7 +54,7 @@ module DiscourseAi
             -> { find_nearest_bbcode_end_index(text, size) },
             -> { find_text_boundary(text, size) },
             -> { size },
-          ].lazy.map(&:call).compact.find { |pos| pos <= flexible_size }
+          ].lazy.map(&:call).compact.find { |pos| pos <= size }
 
         text[0...split_point]
       end
@@ -64,13 +64,15 @@ module DiscourseAi
 
         begin
           doc = Nokogiri::HTML5.fragment(text)
-          current_length = 0
+          max_length_within_target = 0
 
           doc.children.each do |node|
             html = node.to_html
-            end_pos = current_length + html.length
-            return end_pos if end_pos > target_pos
-            current_length = end_pos
+            end_pos = max_length_within_target + html.length
+            if (max_length_within_target > 0 && end_pos > target_pos)
+              return max_length_within_target
+            end
+            max_length_within_target = end_pos
           end
           nil
         rescue Nokogiri::SyntaxError
@@ -79,13 +81,15 @@ module DiscourseAi
       end
 
       def self.find_nearest_bbcode_end_index(text, target_pos)
+        max_length_within_target = 0
         BBCODE_PATTERNS.each do |pattern|
           text.scan(pattern) do |_|
             match = $~
-            tag_start = match.begin(0)
             tag_end = match.end(0)
-
-            return tag_end if tag_start <= target_pos && tag_end > target_pos
+            if (max_length_within_target > 0 && tag_end > target_pos)
+              return max_length_within_target
+            end
+            max_length_within_target = tag_end
           end
         end
 
diff --git a/lib/translation/post_localizer.rb b/lib/translation/post_localizer.rb
index 6f75ea1c..0ca76d2b 100644
--- a/lib/translation/post_localizer.rb
+++ b/lib/translation/post_localizer.rb
@@ -11,11 +11,7 @@ module DiscourseAi
         return if post.raw.length > SiteSetting.ai_translation_max_post_length
         target_locale = target_locale.to_s.sub("-", "_")
 
-        translated_raw =
-          ContentSplitter
-            .split(post.raw)
-            .map { |text| PostRawTranslator.new(text:, target_locale:, post:).translate }
-            .join("")
+        translated_raw = PostRawTranslator.new(text: post.raw, target_locale:, post:).translate
 
         localization =
           PostLocalization.find_or_initialize_by(post_id: post.id, locale: target_locale)
diff --git a/spec/lib/translation/base_translator_spec.rb b/spec/lib/translation/base_translator_spec.rb
index 88e0aa8e..729b2e0c 100644
--- a/spec/lib/translation/base_translator_spec.rb
+++ b/spec/lib/translation/base_translator_spec.rb
@@ -1,7 +1,5 @@
 # frozen_string_literal: true
 
-require "rails_helper"
-
 describe DiscourseAi::Translation::BaseTranslator do
   let!(:persona) do
     AiPersona.find(
@@ -28,7 +26,7 @@ describe DiscourseAi::Translation::BaseTranslator do
         DiscourseAi::Translation::PostRawTranslator.new(text:, target_locale:, post:)
       allow(DiscourseAi::Completions::Prompt).to receive(:new).with(
         persona.system_prompt,
-        messages: array_including({ type: :user, content: post_translator.formatted_content }),
+        messages: array_including({ type: :user, content: a_string_including(text) }),
         post_id: post.id,
         topic_id: post.topic_id,
       ).and_call_original
diff --git a/spec/lib/translation/content_splitter_spec.rb b/spec/lib/translation/content_splitter_spec.rb
index 75847608..b4c77dee 100644
--- a/spec/lib/translation/content_splitter_spec.rb
+++ b/spec/lib/translation/content_splitter_spec.rb
@@ -1,95 +1,92 @@
 # frozen_string_literal: true
 
 describe DiscourseAi::Translation::ContentSplitter do
-  let(:original_limit) { 4000 }
-
-  after { described_class.const_set(:CHUNK_SIZE, original_limit) }
-
-  def set_limit(value)
-    described_class.const_set(:CHUNK_SIZE, value)
-  end
-
   it "returns empty array for empty input" do
-    expect(described_class.split("")).to eq([""])
+    expect(described_class.split(content: "")).to eq([""])
   end
 
   it "handles content with only spaces" do
-    expect(described_class.split(" ")).to eq([" "])
-    expect(described_class.split("  ")).to eq(["  "])
+    expect(described_class.split(content: " ")).to eq([" "])
+    expect(described_class.split(content: "  ")).to eq(["  "])
   end
 
   it "handles nil input" do
-    expect(described_class.split(nil)).to eq([])
+    expect(described_class.split(content: nil)).to eq([])
   end
 
   it "doesn't split content under limit" do
-    text = "hello world"
-    expect(described_class.split(text)).to eq([text])
+    content = "hello world"
+    expect(described_class.split(content:, chunk_size: 20)).to eq([content])
+  end
+
+  it "splits to max chunk size if unsplittable" do
+    content = "a" * 100
+    expect(described_class.split(content:, chunk_size: 10)).to eq(["a" * 10] * 10)
   end
 
   it "preserves HTML tags" do
-    set_limit(10)
-    text = "<p>hello</p><p>meow</p>"
-    expect(described_class.split(text)).to eq(%w[<p>hello</p> <p>meow</p>])
+    content = "<p>hello</p><p>meow</p>"
+    expect(described_class.split(content:, chunk_size: 15)).to eq(%w[<p>hello</p> <p>meow</p>])
 
-    set_limit(35)
-    text = "<div>hello</div> <div>jurassic</div> <p>world</p>"
-    expect(described_class.split(text)).to eq(
-      ["<div>hello</div> <div>jurassic</div>", " <p>world</p>"],
+    content = "<div>hello</div> <div>jurassic</div> <p>world</p>"
+    expect(described_class.split(content:, chunk_size: 40)).to eq(
+      ["<div>hello</div> <div>jurassic</div> ", "<p>world</p>"],
     )
   end
 
   it "preserves BBCode tags" do
-    set_limit(20)
-    text = "[quote]hello[/quote][details]world[/details]"
-    expect(described_class.split(text)).to eq(["[quote]hello[/quote]", "[details]world[/details]"])
+    content = "[quote]hello[/quote][details]world[/details]"
+    expect(described_class.split(content:, chunk_size: 25)).to eq(
+      ["[quote]hello[/quote]", "[details]world[/details]"],
+    )
   end
 
   it "doesn't split in middle of words" do
-    set_limit(10)
-    text = "my kitty best in the world"
-    expect(described_class.split(text)).to eq(["my kitty ", "best in ", "the world"])
+    content = "my kitty best in the world"
+    expect(described_class.split(content:, chunk_size: 10)).to eq(
+      ["my kitty ", "best in ", "the world"],
+    )
   end
 
   it "handles nested tags properly" do
-    set_limit(25)
-    text = "<div>hello<p>cat</p>world</div><p>meow</p>"
-    expect(described_class.split(text)).to eq(%w[<div>hello<p>cat</p>world</div> <p>meow</p>])
+    content = "<div>hello<p>cat</p>world</div><p>meow</p>"
+    expect(described_class.split(content:, chunk_size: 35)).to eq(
+      %w[<div>hello<p>cat</p>world</div> <p>meow</p>],
+    )
   end
 
   it "handles mixed HTML and BBCode" do
-    set_limit(15)
-    text = "<div>hello</div>[quote]world[/quote]<p>beautiful</p>"
-    expect(described_class.split(text)).to eq(
+    content = "<div>hello</div>[quote]world[/quote]<p>beautiful</p>"
+    expect(described_class.split(content:, chunk_size: 20)).to eq(
       ["<div>hello</div>", "[quote]world[/quote]", "<p>beautiful</p>"],
     )
   end
 
   it "preserves newlines in sensible places" do
-    set_limit(10)
-    text = "hello\nbeautiful\nworld\n"
-    expect(described_class.split(text)).to eq(["hello\n", "beautiful\n", "world\n"])
+    content = "hello\nbeautiful\nworld\n"
+    expect(described_class.split(content:, chunk_size: 10)).to eq(
+      ["hello\n", "beautiful\n", "world\n"],
+    )
   end
 
   it "handles email content properly" do
-    set_limit(20)
-    text = "From: test@test.com\nTo: other@test.com\nSubject: Hello\n\nContent here"
-    expect(described_class.split(text)).to eq(
+    content = "From: test@test.com\nTo: other@test.com\nSubject: Hello\n\nContent here"
+    expect(described_class.split(content:, chunk_size: 20)).to eq(
       ["From: test@test.com\n", "To: other@test.com\n", "Subject: Hello\n\n", "Content here"],
     )
   end
 
   it "keeps code blocks intact" do
-    set_limit(30)
-    text = "Text\n```\ncode block\nhere\n```\nmore text"
-    expect(described_class.split(text)).to eq(["Text\n```\ncode block\nhere\n```\n", "more text"])
+    content = "Text\n```\ncode block\nhere\n```\nmore text"
+    expect(described_class.split(content:, chunk_size: 30)).to eq(
+      ["Text\n```\ncode block\nhere\n```\n", "more text"],
+    )
   end
 
   context "with multiple details tags" do
     it "splits correctly between details tags" do
-      set_limit(30)
-      text = "<details>first content</details><details>second content</details>"
-      expect(described_class.split(text)).to eq(
+      content = "<details>first content</details><details>second content</details>"
+      expect(described_class.split(content:, chunk_size: 35)).to eq(
         ["<details>first content</details>", "<details>second content</details>"],
       )
     end