mirror of
https://github.com/discourse/discourse-ai.git
synced 2025-06-29 19:12:15 +00:00
DEV: Split content based on LlmModel's max_output_tokens (#1456)
In discourse/discourse-translator#249 we introduced splitting content (post.raw) before sending it for translation, because we were using a synchronous API. Now that we're streaming thanks to #1424, we chunk the content based on LlmModel.max_output_tokens instead.
This commit is contained in:
parent
740be26625
commit
683bb5725b
@ -49,7 +49,7 @@ module Jobs
|
||||
# do nothing, there are too many sporadic lookup failures
|
||||
rescue => e
|
||||
DiscourseAi::Translation::VerboseLogger.log(
|
||||
"Failed to translate post #{post.id} to #{locale}: #{e.message}",
|
||||
"Failed to translate post #{post.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}",
|
||||
)
|
||||
end
|
||||
end
|
||||
|
@ -47,7 +47,7 @@ module Jobs
|
||||
# do nothing, there are too many sporadic lookup failures
|
||||
rescue => e
|
||||
DiscourseAi::Translation::VerboseLogger.log(
|
||||
"Failed to translate topic #{topic.id} to #{locale}: #{e.message}",
|
||||
"Failed to translate topic #{topic.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}",
|
||||
)
|
||||
end
|
||||
end
|
||||
|
@ -40,7 +40,7 @@ module Jobs
|
||||
# do nothing, there are too many sporadic lookup failures
|
||||
rescue => e
|
||||
DiscourseAi::Translation::VerboseLogger.log(
|
||||
"Failed to translate category #{category.id} to #{locale}: #{e.message}",
|
||||
"Failed to translate category #{category.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}",
|
||||
)
|
||||
ensure
|
||||
remaining_limit -= 1
|
||||
|
@ -65,7 +65,7 @@ module Jobs
|
||||
# do nothing, there are too many sporadic lookup failures
|
||||
rescue => e
|
||||
DiscourseAi::Translation::VerboseLogger.log(
|
||||
"Failed to translate post #{post.id} to #{locale}: #{e.message}",
|
||||
"Failed to translate post #{post.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}",
|
||||
)
|
||||
end
|
||||
end
|
||||
|
@ -62,7 +62,7 @@ module Jobs
|
||||
# do nothing, there are too many sporadic lookup failures
|
||||
rescue => e
|
||||
DiscourseAi::Translation::VerboseLogger.log(
|
||||
"Failed to translate topic #{topic.id} to #{locale}: #{e.message}",
|
||||
"Failed to translate topic #{topic.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}",
|
||||
)
|
||||
end
|
||||
end
|
||||
|
@ -15,26 +15,34 @@ module DiscourseAi
|
||||
if (ai_persona = AiPersona.find_by(id: persona_setting)).blank?
|
||||
return nil
|
||||
end
|
||||
|
||||
translation_user = ai_persona.user || Discourse.system_user
|
||||
persona_klass = ai_persona.class_instance
|
||||
persona = persona_klass.new
|
||||
|
||||
llm_model = LlmModel.find_by(id: preferred_llm_model(persona_klass))
|
||||
return nil if llm_model.blank?
|
||||
model = LlmModel.find_by(id: preferred_llm_model(persona_klass))
|
||||
return nil if model.blank?
|
||||
|
||||
bot =
|
||||
DiscourseAi::Personas::Bot.as(
|
||||
ai_persona.user || Discourse.system_user,
|
||||
persona: persona,
|
||||
model: llm_model,
|
||||
)
|
||||
bot = DiscourseAi::Personas::Bot.as(translation_user, persona:, model:)
|
||||
|
||||
ContentSplitter
|
||||
.split(content: @text, chunk_size: model.max_output_tokens)
|
||||
.map { |text| get_translation(text:, bot:, translation_user:) }
|
||||
.join("")
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def formatted_content(content)
|
||||
{ content:, target_locale: @target_locale }.to_json
|
||||
end
|
||||
|
||||
def get_translation(text:, bot:, translation_user:)
|
||||
context =
|
||||
DiscourseAi::Personas::BotContext.new(
|
||||
user: ai_persona.user || Discourse.system_user,
|
||||
user: translation_user,
|
||||
skip_tool_details: true,
|
||||
feature_name: "translation",
|
||||
messages: [{ type: :user, content: formatted_content }],
|
||||
messages: [{ type: :user, content: formatted_content(text) }],
|
||||
topic: @topic,
|
||||
post: @post,
|
||||
)
|
||||
@ -47,12 +55,6 @@ module DiscourseAi
|
||||
structured_output&.read_buffered_property(:translation)
|
||||
end
|
||||
|
||||
def formatted_content
|
||||
{ content: @text, target_locale: @target_locale }.to_json
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def persona_setting
|
||||
raise NotImplementedError
|
||||
end
|
||||
|
@ -3,7 +3,7 @@
|
||||
module DiscourseAi
|
||||
module Translation
|
||||
class ContentSplitter
|
||||
CHUNK_SIZE = 3000
|
||||
DEFAULT_CHUNK_SIZE = 8192
|
||||
|
||||
BBCODE_PATTERNS = [
|
||||
%r{\[table.*?\].*?\[/table\]}m,
|
||||
@ -23,16 +23,17 @@ module DiscourseAi
|
||||
/\s+/, # any whitespace
|
||||
].freeze
|
||||
|
||||
def self.split(content)
|
||||
def self.split(content:, chunk_size: DEFAULT_CHUNK_SIZE)
|
||||
return [] if content.nil?
|
||||
return [""] if content.empty?
|
||||
return [content] if content.length <= CHUNK_SIZE
|
||||
chunk_size ||= DEFAULT_CHUNK_SIZE
|
||||
return [content] if content.length <= chunk_size
|
||||
|
||||
chunks = []
|
||||
remaining = content.dup
|
||||
|
||||
while remaining.present?
|
||||
chunk = extract_mixed_chunk(remaining)
|
||||
chunk = extract_mixed_chunk(remaining, size: chunk_size)
|
||||
break if chunk.empty?
|
||||
chunks << chunk
|
||||
remaining = remaining[chunk.length..-1]
|
||||
@ -43,9 +44,8 @@ module DiscourseAi
|
||||
|
||||
private
|
||||
|
||||
def self.extract_mixed_chunk(text, size: CHUNK_SIZE)
|
||||
def self.extract_mixed_chunk(text, size:)
|
||||
return text if text.length <= size
|
||||
flexible_size = size * 1.5
|
||||
|
||||
# try each splitting strategy in order
|
||||
split_point =
|
||||
@ -54,7 +54,7 @@ module DiscourseAi
|
||||
-> { find_nearest_bbcode_end_index(text, size) },
|
||||
-> { find_text_boundary(text, size) },
|
||||
-> { size },
|
||||
].lazy.map(&:call).compact.find { |pos| pos <= flexible_size }
|
||||
].lazy.map(&:call).compact.find { |pos| pos <= size }
|
||||
|
||||
text[0...split_point]
|
||||
end
|
||||
@ -64,13 +64,15 @@ module DiscourseAi
|
||||
|
||||
begin
|
||||
doc = Nokogiri::HTML5.fragment(text)
|
||||
current_length = 0
|
||||
max_length_within_target = 0
|
||||
|
||||
doc.children.each do |node|
|
||||
html = node.to_html
|
||||
end_pos = current_length + html.length
|
||||
return end_pos if end_pos > target_pos
|
||||
current_length = end_pos
|
||||
end_pos = max_length_within_target + html.length
|
||||
if (max_length_within_target > 0 && end_pos > target_pos)
|
||||
return max_length_within_target
|
||||
end
|
||||
max_length_within_target = end_pos
|
||||
end
|
||||
nil
|
||||
rescue Nokogiri::SyntaxError
|
||||
@ -79,13 +81,15 @@ module DiscourseAi
|
||||
end
|
||||
|
||||
def self.find_nearest_bbcode_end_index(text, target_pos)
|
||||
max_length_within_target = 0
|
||||
BBCODE_PATTERNS.each do |pattern|
|
||||
text.scan(pattern) do |_|
|
||||
match = $~
|
||||
tag_start = match.begin(0)
|
||||
tag_end = match.end(0)
|
||||
|
||||
return tag_end if tag_start <= target_pos && tag_end > target_pos
|
||||
if (max_length_within_target > 0 && tag_end > target_pos)
|
||||
return max_length_within_target
|
||||
end
|
||||
max_length_within_target = tag_end
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -11,11 +11,7 @@ module DiscourseAi
|
||||
return if post.raw.length > SiteSetting.ai_translation_max_post_length
|
||||
target_locale = target_locale.to_s.sub("-", "_")
|
||||
|
||||
translated_raw =
|
||||
ContentSplitter
|
||||
.split(post.raw)
|
||||
.map { |text| PostRawTranslator.new(text:, target_locale:, post:).translate }
|
||||
.join("")
|
||||
translated_raw = PostRawTranslator.new(text: post.raw, target_locale:, post:).translate
|
||||
|
||||
localization =
|
||||
PostLocalization.find_or_initialize_by(post_id: post.id, locale: target_locale)
|
||||
|
@ -1,7 +1,5 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
require "rails_helper"
|
||||
|
||||
describe DiscourseAi::Translation::BaseTranslator do
|
||||
let!(:persona) do
|
||||
AiPersona.find(
|
||||
@ -28,7 +26,7 @@ describe DiscourseAi::Translation::BaseTranslator do
|
||||
DiscourseAi::Translation::PostRawTranslator.new(text:, target_locale:, post:)
|
||||
allow(DiscourseAi::Completions::Prompt).to receive(:new).with(
|
||||
persona.system_prompt,
|
||||
messages: array_including({ type: :user, content: post_translator.formatted_content }),
|
||||
messages: array_including({ type: :user, content: a_string_including(text) }),
|
||||
post_id: post.id,
|
||||
topic_id: post.topic_id,
|
||||
).and_call_original
|
||||
|
@ -1,95 +1,92 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
describe DiscourseAi::Translation::ContentSplitter do
|
||||
let(:original_limit) { 4000 }
|
||||
|
||||
after { described_class.const_set(:CHUNK_SIZE, original_limit) }
|
||||
|
||||
def set_limit(value)
|
||||
described_class.const_set(:CHUNK_SIZE, value)
|
||||
end
|
||||
|
||||
it "returns empty array for empty input" do
|
||||
expect(described_class.split("")).to eq([""])
|
||||
expect(described_class.split(content: "")).to eq([""])
|
||||
end
|
||||
|
||||
it "handles content with only spaces" do
|
||||
expect(described_class.split(" ")).to eq([" "])
|
||||
expect(described_class.split(" ")).to eq([" "])
|
||||
expect(described_class.split(content: " ")).to eq([" "])
|
||||
expect(described_class.split(content: " ")).to eq([" "])
|
||||
end
|
||||
|
||||
it "handles nil input" do
|
||||
expect(described_class.split(nil)).to eq([])
|
||||
expect(described_class.split(content: nil)).to eq([])
|
||||
end
|
||||
|
||||
it "doesn't split content under limit" do
|
||||
text = "hello world"
|
||||
expect(described_class.split(text)).to eq([text])
|
||||
content = "hello world"
|
||||
expect(described_class.split(content:, chunk_size: 20)).to eq([content])
|
||||
end
|
||||
|
||||
it "splits to max chunk size if unsplittable" do
|
||||
content = "a" * 100
|
||||
expect(described_class.split(content:, chunk_size: 10)).to eq(["a" * 10] * 10)
|
||||
end
|
||||
|
||||
it "preserves HTML tags" do
|
||||
set_limit(10)
|
||||
text = "<p>hello</p><p>meow</p>"
|
||||
expect(described_class.split(text)).to eq(%w[<p>hello</p> <p>meow</p>])
|
||||
content = "<p>hello</p><p>meow</p>"
|
||||
expect(described_class.split(content:, chunk_size: 15)).to eq(%w[<p>hello</p> <p>meow</p>])
|
||||
|
||||
set_limit(35)
|
||||
text = "<div>hello</div> <div>jurassic</div> <p>world</p>"
|
||||
expect(described_class.split(text)).to eq(
|
||||
["<div>hello</div> <div>jurassic</div>", " <p>world</p>"],
|
||||
content = "<div>hello</div> <div>jurassic</div> <p>world</p>"
|
||||
expect(described_class.split(content:, chunk_size: 40)).to eq(
|
||||
["<div>hello</div> <div>jurassic</div> ", "<p>world</p>"],
|
||||
)
|
||||
end
|
||||
|
||||
it "preserves BBCode tags" do
|
||||
set_limit(20)
|
||||
text = "[quote]hello[/quote][details]world[/details]"
|
||||
expect(described_class.split(text)).to eq(["[quote]hello[/quote]", "[details]world[/details]"])
|
||||
content = "[quote]hello[/quote][details]world[/details]"
|
||||
expect(described_class.split(content:, chunk_size: 25)).to eq(
|
||||
["[quote]hello[/quote]", "[details]world[/details]"],
|
||||
)
|
||||
end
|
||||
|
||||
it "doesn't split in middle of words" do
|
||||
set_limit(10)
|
||||
text = "my kitty best in the world"
|
||||
expect(described_class.split(text)).to eq(["my kitty ", "best in ", "the world"])
|
||||
content = "my kitty best in the world"
|
||||
expect(described_class.split(content:, chunk_size: 10)).to eq(
|
||||
["my kitty ", "best in ", "the world"],
|
||||
)
|
||||
end
|
||||
|
||||
it "handles nested tags properly" do
|
||||
set_limit(25)
|
||||
text = "<div>hello<p>cat</p>world</div><p>meow</p>"
|
||||
expect(described_class.split(text)).to eq(%w[<div>hello<p>cat</p>world</div> <p>meow</p>])
|
||||
content = "<div>hello<p>cat</p>world</div><p>meow</p>"
|
||||
expect(described_class.split(content:, chunk_size: 35)).to eq(
|
||||
%w[<div>hello<p>cat</p>world</div> <p>meow</p>],
|
||||
)
|
||||
end
|
||||
|
||||
it "handles mixed HTML and BBCode" do
|
||||
set_limit(15)
|
||||
text = "<div>hello</div>[quote]world[/quote]<p>beautiful</p>"
|
||||
expect(described_class.split(text)).to eq(
|
||||
content = "<div>hello</div>[quote]world[/quote]<p>beautiful</p>"
|
||||
expect(described_class.split(content:, chunk_size: 20)).to eq(
|
||||
["<div>hello</div>", "[quote]world[/quote]", "<p>beautiful</p>"],
|
||||
)
|
||||
end
|
||||
|
||||
it "preserves newlines in sensible places" do
|
||||
set_limit(10)
|
||||
text = "hello\nbeautiful\nworld\n"
|
||||
expect(described_class.split(text)).to eq(["hello\n", "beautiful\n", "world\n"])
|
||||
content = "hello\nbeautiful\nworld\n"
|
||||
expect(described_class.split(content:, chunk_size: 10)).to eq(
|
||||
["hello\n", "beautiful\n", "world\n"],
|
||||
)
|
||||
end
|
||||
|
||||
it "handles email content properly" do
|
||||
set_limit(20)
|
||||
text = "From: test@test.com\nTo: other@test.com\nSubject: Hello\n\nContent here"
|
||||
expect(described_class.split(text)).to eq(
|
||||
content = "From: test@test.com\nTo: other@test.com\nSubject: Hello\n\nContent here"
|
||||
expect(described_class.split(content:, chunk_size: 20)).to eq(
|
||||
["From: test@test.com\n", "To: other@test.com\n", "Subject: Hello\n\n", "Content here"],
|
||||
)
|
||||
end
|
||||
|
||||
it "keeps code blocks intact" do
|
||||
set_limit(30)
|
||||
text = "Text\n```\ncode block\nhere\n```\nmore text"
|
||||
expect(described_class.split(text)).to eq(["Text\n```\ncode block\nhere\n```\n", "more text"])
|
||||
content = "Text\n```\ncode block\nhere\n```\nmore text"
|
||||
expect(described_class.split(content:, chunk_size: 30)).to eq(
|
||||
["Text\n```\ncode block\nhere\n```\n", "more text"],
|
||||
)
|
||||
end
|
||||
|
||||
context "with multiple details tags" do
|
||||
it "splits correctly between details tags" do
|
||||
set_limit(30)
|
||||
text = "<details>first content</details><details>second content</details>"
|
||||
expect(described_class.split(text)).to eq(
|
||||
content = "<details>first content</details><details>second content</details>"
|
||||
expect(described_class.split(content:, chunk_size: 35)).to eq(
|
||||
["<details>first content</details>", "<details>second content</details>"],
|
||||
)
|
||||
end
|
||||
|
Loading…
x
Reference in New Issue
Block a user