From 683bb5725bfff386317912e042e3582aa22473d2 Mon Sep 17 00:00:00 2001 From: Natalie Tay Date: Mon, 23 Jun 2025 21:11:20 +0800 Subject: [PATCH] DEV: Split content based on llmmodel's max_output_tokens (#1456) In discourse/discourse-translator#249 we introduced splitting content (post.raw) prior to sending to translation as we were using a sync api. Now that we're streaming thanks to #1424, we'll chunk based on the LlmModel.max_output_tokens. --- app/jobs/regular/detect_translate_post.rb | 2 +- app/jobs/regular/detect_translate_topic.rb | 2 +- app/jobs/regular/localize_categories.rb | 2 +- app/jobs/regular/localize_posts.rb | 2 +- app/jobs/regular/localize_topics.rb | 2 +- lib/translation/base_translator.rb | 36 ++++---- lib/translation/content_splitter.rb | 32 ++++--- lib/translation/post_localizer.rb | 6 +- spec/lib/translation/base_translator_spec.rb | 4 +- spec/lib/translation/content_splitter_spec.rb | 87 +++++++++---------- 10 files changed, 86 insertions(+), 89 deletions(-) diff --git a/app/jobs/regular/detect_translate_post.rb b/app/jobs/regular/detect_translate_post.rb index ea55dd54..ec2381d2 100644 --- a/app/jobs/regular/detect_translate_post.rb +++ b/app/jobs/regular/detect_translate_post.rb @@ -49,7 +49,7 @@ module Jobs # do nothing, there are too many sporadic lookup failures rescue => e DiscourseAi::Translation::VerboseLogger.log( - "Failed to translate post #{post.id} to #{locale}: #{e.message}", + "Failed to translate post #{post.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}", ) end end diff --git a/app/jobs/regular/detect_translate_topic.rb b/app/jobs/regular/detect_translate_topic.rb index 8a683605..287d5ed1 100644 --- a/app/jobs/regular/detect_translate_topic.rb +++ b/app/jobs/regular/detect_translate_topic.rb @@ -47,7 +47,7 @@ module Jobs # do nothing, there are too many sporadic lookup failures rescue => e DiscourseAi::Translation::VerboseLogger.log( - "Failed to translate topic #{topic.id} to #{locale}: #{e.message}", + "Failed to translate topic #{topic.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}", ) end end diff --git a/app/jobs/regular/localize_categories.rb b/app/jobs/regular/localize_categories.rb index 9ede9f21..987cd254 100644 --- a/app/jobs/regular/localize_categories.rb +++ b/app/jobs/regular/localize_categories.rb @@ -40,7 +40,7 @@ module Jobs # do nothing, there are too many sporadic lookup failures rescue => e DiscourseAi::Translation::VerboseLogger.log( - "Failed to translate category #{category.id} to #{locale}: #{e.message}", + "Failed to translate category #{category.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}", ) ensure remaining_limit -= 1 diff --git a/app/jobs/regular/localize_posts.rb b/app/jobs/regular/localize_posts.rb index 3dad28e0..50cf4f6c 100644 --- a/app/jobs/regular/localize_posts.rb +++ b/app/jobs/regular/localize_posts.rb @@ -65,7 +65,7 @@ module Jobs # do nothing, there are too many sporadic lookup failures rescue => e DiscourseAi::Translation::VerboseLogger.log( - "Failed to translate post #{post.id} to #{locale}: #{e.message}", + "Failed to translate post #{post.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}", ) end end diff --git a/app/jobs/regular/localize_topics.rb b/app/jobs/regular/localize_topics.rb index dceedf98..7428eb1f 100644 --- a/app/jobs/regular/localize_topics.rb +++ b/app/jobs/regular/localize_topics.rb @@ -62,7 +62,7 @@ module Jobs # do nothing, there are too many sporadic lookup failures rescue => e DiscourseAi::Translation::VerboseLogger.log( - "Failed to translate topic #{topic.id} to #{locale}: #{e.message}", + "Failed to translate topic #{topic.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}", ) end end diff --git a/lib/translation/base_translator.rb b/lib/translation/base_translator.rb index c275b843..32fe8db3 100644 --- a/lib/translation/base_translator.rb +++ b/lib/translation/base_translator.rb @@ -15,26 +15,34 @@ module DiscourseAi if (ai_persona = AiPersona.find_by(id: persona_setting)).blank? return nil end - + translation_user = ai_persona.user || Discourse.system_user persona_klass = ai_persona.class_instance persona = persona_klass.new - llm_model = LlmModel.find_by(id: preferred_llm_model(persona_klass)) - return nil if llm_model.blank? + model = LlmModel.find_by(id: preferred_llm_model(persona_klass)) + return nil if model.blank? - bot = - DiscourseAi::Personas::Bot.as( - ai_persona.user || Discourse.system_user, - persona: persona, - model: llm_model, - ) + bot = DiscourseAi::Personas::Bot.as(translation_user, persona:, model:) + ContentSplitter + .split(content: @text, chunk_size: model.max_output_tokens) + .map { |text| get_translation(text:, bot:, translation_user:) } + .join("") + end + + private + + def formatted_content(content) + { content:, target_locale: @target_locale }.to_json + end + + def get_translation(text:, bot:, translation_user:) context = DiscourseAi::Personas::BotContext.new( - user: ai_persona.user || Discourse.system_user, + user: translation_user, skip_tool_details: true, feature_name: "translation", - messages: [{ type: :user, content: formatted_content }], + messages: [{ type: :user, content: formatted_content(text) }], topic: @topic, post: @post, ) @@ -47,12 +55,6 @@ module DiscourseAi structured_output&.read_buffered_property(:translation) end - def formatted_content - { content: @text, target_locale: @target_locale }.to_json - end - - private - def persona_setting raise NotImplementedError end diff --git a/lib/translation/content_splitter.rb b/lib/translation/content_splitter.rb index 4ce1cc97..1885317e 100644 --- a/lib/translation/content_splitter.rb +++ b/lib/translation/content_splitter.rb @@ -3,7 +3,7 @@ module DiscourseAi module Translation class ContentSplitter - CHUNK_SIZE = 3000 + DEFAULT_CHUNK_SIZE = 8192 BBCODE_PATTERNS = [ %r{\[table.*?\].*?\[/table\]}m, @@ -23,16 +23,17 @@ module DiscourseAi /\s+/, # any whitespace ].freeze - def self.split(content) + def self.split(content:, chunk_size: DEFAULT_CHUNK_SIZE) return [] if content.nil? return [""] if content.empty? - return [content] if content.length <= CHUNK_SIZE + chunk_size ||= DEFAULT_CHUNK_SIZE + return [content] if content.length <= chunk_size chunks = [] remaining = content.dup while remaining.present? - chunk = extract_mixed_chunk(remaining) + chunk = extract_mixed_chunk(remaining, size: chunk_size) break if chunk.empty? chunks << chunk remaining = remaining[chunk.length..-1] @@ -43,9 +44,8 @@ module DiscourseAi private - def self.extract_mixed_chunk(text, size: CHUNK_SIZE) + def self.extract_mixed_chunk(text, size:) return text if text.length <= size - flexible_size = size * 1.5 # try each splitting strategy in order split_point = @@ -54,7 +54,7 @@ module DiscourseAi -> { find_nearest_bbcode_end_index(text, size) }, -> { find_text_boundary(text, size) }, -> { size }, - ].lazy.map(&:call).compact.find { |pos| pos <= flexible_size } + ].lazy.map(&:call).compact.find { |pos| pos <= size } text[0...split_point] end @@ -64,13 +64,15 @@ module DiscourseAi begin doc = Nokogiri::HTML5.fragment(text) - current_length = 0 + max_length_within_target = 0 doc.children.each do |node| html = node.to_html - end_pos = current_length + html.length - return end_pos if end_pos > target_pos - current_length = end_pos + end_pos = max_length_within_target + html.length + if (max_length_within_target > 0 && end_pos > target_pos) + return max_length_within_target + end + max_length_within_target = end_pos end nil rescue Nokogiri::SyntaxError @@ -79,13 +81,15 @@ module DiscourseAi end def self.find_nearest_bbcode_end_index(text, target_pos) + max_length_within_target = 0 BBCODE_PATTERNS.each do |pattern| text.scan(pattern) do |_| match = $~ - tag_start = match.begin(0) tag_end = match.end(0) - - return tag_end if tag_start <= target_pos && tag_end > target_pos + if (max_length_within_target > 0 && tag_end > target_pos) + return max_length_within_target + end + max_length_within_target = tag_end end end diff --git a/lib/translation/post_localizer.rb b/lib/translation/post_localizer.rb index 6f75ea1c..0ca76d2b 100644 --- a/lib/translation/post_localizer.rb +++ b/lib/translation/post_localizer.rb @@ -11,11 +11,7 @@ module DiscourseAi return if post.raw.length > SiteSetting.ai_translation_max_post_length target_locale = target_locale.to_s.sub("-", "_") - translated_raw = - ContentSplitter - .split(post.raw) - .map { |text| PostRawTranslator.new(text:, target_locale:, post:).translate } - .join("") + translated_raw = PostRawTranslator.new(text: post.raw, target_locale:, post:).translate localization = PostLocalization.find_or_initialize_by(post_id: post.id, locale: target_locale) diff --git a/spec/lib/translation/base_translator_spec.rb b/spec/lib/translation/base_translator_spec.rb index 88e0aa8e..729b2e0c 100644 --- a/spec/lib/translation/base_translator_spec.rb +++ b/spec/lib/translation/base_translator_spec.rb @@ -1,7 +1,5 @@ # frozen_string_literal: true -require "rails_helper" - describe DiscourseAi::Translation::BaseTranslator do let!(:persona) do AiPersona.find( @@ -28,7 +26,7 @@ describe DiscourseAi::Translation::BaseTranslator do DiscourseAi::Translation::PostRawTranslator.new(text:, target_locale:, post:) allow(DiscourseAi::Completions::Prompt).to receive(:new).with( persona.system_prompt, - messages: array_including({ type: :user, content: post_translator.formatted_content }), + messages: array_including({ type: :user, content: a_string_including(text) }), post_id: post.id, topic_id: post.topic_id, ).and_call_original diff --git a/spec/lib/translation/content_splitter_spec.rb b/spec/lib/translation/content_splitter_spec.rb index 75847608..b4c77dee 100644 --- a/spec/lib/translation/content_splitter_spec.rb +++ b/spec/lib/translation/content_splitter_spec.rb @@ -1,95 +1,92 @@ # frozen_string_literal: true describe DiscourseAi::Translation::ContentSplitter do - let(:original_limit) { 4000 } - - after { described_class.const_set(:CHUNK_SIZE, original_limit) } - - def set_limit(value) - described_class.const_set(:CHUNK_SIZE, value) - end - it "returns empty array for empty input" do - expect(described_class.split("")).to eq([""]) + expect(described_class.split(content: "")).to eq([""]) end it "handles content with only spaces" do - expect(described_class.split(" ")).to eq([" "]) - expect(described_class.split(" ")).to eq([" "]) + expect(described_class.split(content: " ")).to eq([" "]) + expect(described_class.split(content: " ")).to eq([" "]) end it "handles nil input" do - expect(described_class.split(nil)).to eq([]) + expect(described_class.split(content: nil)).to eq([]) end it "doesn't split content under limit" do - text = "hello world" - expect(described_class.split(text)).to eq([text]) + content = "hello world" + expect(described_class.split(content:, chunk_size: 20)).to eq([content]) + end + + it "splits to max chunk size if unsplittable" do + content = "a" * 100 + expect(described_class.split(content:, chunk_size: 10)).to eq(["a" * 10] * 10) end it "preserves HTML tags" do - set_limit(10) - text = "

hello

meow

" - expect(described_class.split(text)).to eq(%w[

hello

meow

]) + content = "

hello

meow

" + expect(described_class.split(content:, chunk_size: 15)).to eq(%w[

hello

meow

]) - set_limit(35) - text = "
hello
jurassic

world

" - expect(described_class.split(text)).to eq( - ["
hello
jurassic
", "

world

"], + content = "
hello
jurassic

world

" + expect(described_class.split(content:, chunk_size: 40)).to eq( + ["
hello
jurassic
", "

world

"], ) end it "preserves BBCode tags" do - set_limit(20) - text = "[quote]hello[/quote][details]world[/details]" - expect(described_class.split(text)).to eq(["[quote]hello[/quote]", "[details]world[/details]"]) + content = "[quote]hello[/quote][details]world[/details]" + expect(described_class.split(content:, chunk_size: 25)).to eq( + ["[quote]hello[/quote]", "[details]world[/details]"], + ) end it "doesn't split in middle of words" do - set_limit(10) - text = "my kitty best in the world" - expect(described_class.split(text)).to eq(["my kitty ", "best in ", "the world"]) + content = "my kitty best in the world" + expect(described_class.split(content:, chunk_size: 10)).to eq( + ["my kitty ", "best in ", "the world"], + ) end it "handles nested tags properly" do - set_limit(25) - text = "
hello

cat

world

meow

" - expect(described_class.split(text)).to eq(%w[
hello

cat

world

meow

]) + content = "
hello

cat

world

meow

" + expect(described_class.split(content:, chunk_size: 35)).to eq( + %w[
hello

cat

world

meow

], + ) end it "handles mixed HTML and BBCode" do - set_limit(15) - text = "
hello
[quote]world[/quote]

beautiful

" - expect(described_class.split(text)).to eq( + content = "
hello
[quote]world[/quote]

beautiful

" + expect(described_class.split(content:, chunk_size: 20)).to eq( ["
hello
", "[quote]world[/quote]", "

beautiful

"], ) end it "preserves newlines in sensible places" do - set_limit(10) - text = "hello\nbeautiful\nworld\n" - expect(described_class.split(text)).to eq(["hello\n", "beautiful\n", "world\n"]) + content = "hello\nbeautiful\nworld\n" + expect(described_class.split(content:, chunk_size: 10)).to eq( + ["hello\n", "beautiful\n", "world\n"], + ) end it "handles email content properly" do - set_limit(20) - text = "From: test@test.com\nTo: other@test.com\nSubject: Hello\n\nContent here" - expect(described_class.split(text)).to eq( + content = "From: test@test.com\nTo: other@test.com\nSubject: Hello\n\nContent here" + expect(described_class.split(content:, chunk_size: 20)).to eq( ["From: test@test.com\n", "To: other@test.com\n", "Subject: Hello\n\n", "Content here"], ) end it "keeps code blocks intact" do - set_limit(30) - text = "Text\n```\ncode block\nhere\n```\nmore text" - expect(described_class.split(text)).to eq(["Text\n```\ncode block\nhere\n```\n", "more text"]) + content = "Text\n```\ncode block\nhere\n```\nmore text" + expect(described_class.split(content:, chunk_size: 30)).to eq( + ["Text\n```\ncode block\nhere\n```\n", "more text"], + ) end context "with multiple details tags" do it "splits correctly between details tags" do - set_limit(30) - text = "
first content
second content
" - expect(described_class.split(text)).to eq( + content = "
first content
second content
" + expect(described_class.split(content:, chunk_size: 35)).to eq( ["
first content
", "
second content
"], ) end