DEV: Also detect locale of categories and do not translate if already in the locale (#1413)

Previously I had not added `locale` to categories: category names tend to be just a single word, so I did not think it was worth carrying locale information.

Because certain LLMs are poorer at translation, category descriptions got pretty messy. We added locale support in https://github.com/discourse/discourse/pull/32962.

This PR adds automatic locale detection for categories, and skips translating a category into its own locale.
This commit is contained in:
Natalie Tay 2025-06-06 22:41:48 +08:00 committed by GitHub
parent 6817866de9
commit 8a3a247b11
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 267 additions and 20 deletions

View File

@ -1,3 +1,4 @@
< 3.5.0.beta6-dev: 3e74eea1e5e3143888d67a8d8a11206df214dc24
< 3.5.0.beta3-dev: 09a68414804a1447f52e5d60691ba59742cda9ec < 3.5.0.beta3-dev: 09a68414804a1447f52e5d60691ba59742cda9ec
< 3.5.0.beta2-dev: de8624416a15b3d8e7ad350b083cc1420451ccec < 3.5.0.beta2-dev: de8624416a15b3d8e7ad350b083cc1420451ccec
< 3.5.0.beta1-dev: bdef136080074a993e7c4f5ca562edc31a8ba756 < 3.5.0.beta1-dev: bdef136080074a993e7c4f5ca562edc31a8ba756

View File

@ -17,7 +17,8 @@ module Jobs
cat_id = args[:from_category_id] || Category.order(:id).first&.id cat_id = args[:from_category_id] || Category.order(:id).first&.id
last_id = nil last_id = nil
categories = Category.where("id >= ?", cat_id).order(:id).limit(BATCH_SIZE) categories =
Category.where("id >= ? AND locale IS NOT NULL", cat_id).order(:id).limit(BATCH_SIZE)
return if categories.empty? return if categories.empty?
categories.each do |category| categories.each do |category|
@ -26,9 +27,13 @@ module Jobs
next next
end end
CategoryLocalization.transaction do
locales.each do |locale| locales.each do |locale|
next if CategoryLocalization.exists?(category_id: category.id, locale: locale) localization = category.category_localizations.find_by(locale:)
if locale == category.locale && localization
localization.destroy
else
next if locale == category.locale
begin begin
DiscourseAi::Translation::CategoryLocalizer.localize(category, locale) DiscourseAi::Translation::CategoryLocalizer.localize(category, locale)
rescue FinalDestination::SSRFDetector::LookupFailedError rescue FinalDestination::SSRFDetector::LookupFailedError

View File

@ -0,0 +1,37 @@
# frozen_string_literal: true
module Jobs
  # Hourly scheduled job that runs locale detection on categories whose
  # `locale` column is still NULL. Each run handles at most
  # `SiteSetting.ai_translation_backfill_rate` categories.
  class CategoriesLocaleDetectionBackfill < ::Jobs::Scheduled
    every 1.hour
    sidekiq_options retry: false
    cluster_concurrency 1

    # Runs locale detection for one batch of categories without a locale.
    #
    # Returns early when the plugin or AI translation is disabled, when the
    # backfill rate is 0, or when no category matches the query.
    #
    # @param args [Hash] job arguments; not read by this job
    def execute(args)
      return unless SiteSetting.discourse_ai_enabled && SiteSetting.ai_translation_enabled
      return if SiteSetting.ai_translation_backfill_rate == 0

      scope = Category.where(locale: nil)
      if SiteSetting.ai_translation_backfill_limit_to_public_content
        scope = scope.where(read_restricted: false)
      end

      pending = scope.limit(SiteSetting.ai_translation_backfill_rate)
      return if pending.empty?

      pending.each { |category| detect_locale_for(category) }

      # The figure is the batch size: every category in the batch is counted,
      # whether or not its detection raised.
      DiscourseAi::Translation::VerboseLogger.log("Detected #{pending.size} category locales")
    end

    private

    # Detects the locale of a single category, containing any failure so the
    # rest of the batch still runs.
    def detect_locale_for(category)
      DiscourseAi::Translation::CategoryLocaleDetector.detect_locale(category)
    rescue FinalDestination::SSRFDetector::LookupFailedError
      # Intentionally ignored: lookup failures are neither logged nor re-raised.
    rescue => e
      DiscourseAi::Translation::VerboseLogger.log(
        "Failed to detect category #{category.id}'s locale: #{e.message}",
      )
    end
  end
end

View File

@ -0,0 +1,19 @@
# frozen_string_literal: true
module DiscourseAi
  module Translation
    # Detects the language a category is written in and stores it on the
    # category's `locale` column.
    class CategoryLocaleDetector
      # Detects and persists the locale of +category+.
      #
      # The text given to the language detector is the category name and
      # description (nil entries dropped) joined by a blank line.
      #
      # @param category [Category, nil] the category to inspect
      # @return [String, nil] the normalized locale that was written, or nil
      #   when the category or its combined text is blank (nothing is written
      #   in that case)
      def self.detect_locale(category)
        return if category.blank?

        parts = [category.name, category.description].compact
        text = parts.join("\n\n")
        return if text.blank?

        raw_locale = LanguageDetector.new(text).detect

        # update_column writes the single attribute directly; see the Rails
        # persistence docs for what it skips (validations, callbacks).
        LocaleNormalizer
          .normalize_to_i18n(raw_locale)
          .tap { |normalized| category.update_column(:locale, normalized) }
      end
    end
  end
end

View File

@ -9,9 +9,12 @@ module DiscourseAi
target_locale_sym = target_locale.to_s.sub("-", "_").to_sym target_locale_sym = target_locale.to_s.sub("-", "_").to_sym
translated_name = ShortTextTranslator.new(category.name, target_locale_sym).translate translated_name = ShortTextTranslator.new(category.name, target_locale_sym).translate
# category descriptions are first paragraphs of posts
translated_description = translated_description =
if category.description.present?
PostRawTranslator.new(category.description, target_locale_sym).translate PostRawTranslator.new(category.description, target_locale_sym).translate
else
""
end
localization = localization =
CategoryLocalization.find_or_initialize_by( CategoryLocalization.find_or_initialize_by(

View File

@ -53,7 +53,9 @@ describe Jobs::LocalizeCategories do
end end
it "translates categories to the configured locales" do it "translates categories to the configured locales" do
Category.update_all(locale: "en")
number_of_categories = Category.count number_of_categories = Category.count
DiscourseAi::Translation::CategoryLocalizer DiscourseAi::Translation::CategoryLocalizer
.expects(:localize) .expects(:localize)
.with(is_a(Category), "pt") .with(is_a(Category), "pt")
@ -69,20 +71,19 @@ describe Jobs::LocalizeCategories do
it "skips categories that already have localizations" do it "skips categories that already have localizations" do
localize_all_categories("pt", "zh_CN") localize_all_categories("pt", "zh_CN")
category1 = DiscourseAi::Translation::CategoryLocalizer.expects(:localize).with(is_a(Category), "pt").never
Fabricate(:category, name: "First Category", description: "First category description") DiscourseAi::Translation::CategoryLocalizer
Fabricate(:category_localization, category: category1, locale: "pt", name: "Primeira Categoria") .expects(:localize)
.with(is_a(Category), "zh_CN")
# It should only translate to Chinese, not Portuguese .never
DiscourseAi::Translation::CategoryLocalizer.expects(:localize).with(category1, "pt").never
DiscourseAi::Translation::CategoryLocalizer.expects(:localize).with(category1, "zh_CN").once
job.execute({}) job.execute({})
end end
it "continues from a specified category ID" do it "continues from a specified category ID" do
category1 = Fabricate(:category, name: "First", description: "First description") category1 = Fabricate(:category, name: "First", description: "First description", locale: "en")
category2 = Fabricate(:category, name: "Second", description: "Second description") category2 =
Fabricate(:category, name: "Second", description: "Second description", locale: "en")
DiscourseAi::Translation::CategoryLocalizer DiscourseAi::Translation::CategoryLocalizer
.expects(:localize) .expects(:localize)
@ -99,7 +100,7 @@ describe Jobs::LocalizeCategories do
it "handles translation errors gracefully" do it "handles translation errors gracefully" do
localize_all_categories("pt", "zh_CN") localize_all_categories("pt", "zh_CN")
category1 = Fabricate(:category, name: "First", description: "First description") category1 = Fabricate(:category, name: "First", description: "First description", locale: "en")
DiscourseAi::Translation::CategoryLocalizer DiscourseAi::Translation::CategoryLocalizer
.expects(:localize) .expects(:localize)
.with(category1, "pt") .with(category1, "pt")
@ -110,6 +111,8 @@ describe Jobs::LocalizeCategories do
end end
it "enqueues the next batch when there are more categories" do it "enqueues the next batch when there are more categories" do
Category.update_all(locale: "en")
Jobs.run_later! Jobs.run_later!
freeze_time freeze_time
Jobs::LocalizeCategories.const_set(:BATCH_SIZE, 1) Jobs::LocalizeCategories.const_set(:BATCH_SIZE, 1)
@ -134,10 +137,8 @@ describe Jobs::LocalizeCategories do
it "skips read-restricted categories when configured" do it "skips read-restricted categories when configured" do
SiteSetting.ai_translation_backfill_limit_to_public_content = true SiteSetting.ai_translation_backfill_limit_to_public_content = true
category1 = Fabricate(:category, name: "Public Category", read_restricted: false) category1 = Fabricate(:category, name: "Public Category", read_restricted: false, locale: "en")
category2 = Fabricate(:category, name: "Private Category", read_restricted: true) category2 = Fabricate(:category, name: "Private Category", read_restricted: true, locale: "en")
DiscourseAi::Translation::CategoryLocalizer.expects(:localize).at_least_once
DiscourseAi::Translation::CategoryLocalizer DiscourseAi::Translation::CategoryLocalizer
.expects(:localize) .expects(:localize)
@ -150,4 +151,40 @@ describe Jobs::LocalizeCategories do
job.execute({}) job.execute({})
end end
it "skips creating localizations in the same language as the category's locale" do
Category.update_all(locale: "pt")
DiscourseAi::Translation::CategoryLocalizer.expects(:localize).with(is_a(Category), "pt").never
DiscourseAi::Translation::CategoryLocalizer
.expects(:localize)
.with(is_a(Category), "zh_CN")
.times(Category.count)
job.execute({})
end
it "deletes existing localizations that match the category's locale" do
# update all categories to portuguese
Category.update_all(locale: "pt")
localize_all_categories("pt", "zh_CN")
expect { job.execute({}) }.to change { CategoryLocalization.exists?(locale: "pt") }.from(
true,
).to(false)
end
it "doesn't process categories with nil locale" do
# Add a category with nil locale
nil_locale_category = Fabricate(:category, name: "No Locale", locale: nil)
# Make sure our query for categories with non-null locales excludes it
DiscourseAi::Translation::CategoryLocalizer
.expects(:localize)
.with(nil_locale_category, any_parameters)
.never
job.execute({})
end
end end

View File

@ -0,0 +1,103 @@
# frozen_string_literal: true
# Specs for the job that runs locale detection on categories with a nil locale.
# CategoryLocaleDetector.detect_locale is mocked in every example, so no real
# language detection happens here.
describe Jobs::CategoriesLocaleDetectionBackfill do
# A category without a locale, i.e. one the job should pick up.
fab!(:category) { Fabricate(:category, locale: nil) }
subject(:job) { described_class.new }
# Baseline: plugin on, a fake LLM set as the translation model, translation on,
# and a backfill rate of 100.
before do
SiteSetting.discourse_ai_enabled = true
Fabricate(:fake_model).tap do |fake_llm|
SiteSetting.public_send("ai_translation_model=", "custom:#{fake_llm.id}")
end
SiteSetting.ai_translation_enabled = true
SiteSetting.ai_translation_backfill_rate = 100
end
# The next three examples each flip one setting and assert that detection is never attempted.
it "does nothing when AI is disabled" do
SiteSetting.discourse_ai_enabled = false
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).never
job.execute({})
end
it "does nothing when content translation is disabled" do
SiteSetting.ai_translation_enabled = false
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).never
job.execute({})
end
it "does nothing when backfill rate is 0" do
SiteSetting.ai_translation_backfill_rate = 0
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).never
job.execute({})
end
it "does nothing when there are no categories to detect" do
# Give every category a locale so none has a nil locale.
Category.update_all(locale: "en")
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).never
job.execute({})
end
it "detects locale for categories with nil locale" do
# NOTE(review): expecting Category.count calls presumes every existing category has a nil locale here — confirm.
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).with(is_a(Category)).times(Category.count)
job.execute({})
end
it "handles detection errors gracefully" do
# Declaration order matters: per Mocha's documented matching, the most recently
# defined matching expectation is tried first, so the broad is_a(Category)
# expectation acts as a catch-all and the one for `category` below raises.
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).with(is_a(Category)).at_least_once
DiscourseAi::Translation::CategoryLocaleDetector
.expects(:detect_locale)
.with(category)
.raises(StandardError.new("error"))
.once
expect { job.execute({}) }.not_to raise_error
end
it "logs a summary after running" do
# detect_locale is stubbed out, so only the summary log call is under test.
# NOTE(review): Category.count in the message presumes every category is in the batch — confirm.
DiscourseAi::Translation::CategoryLocaleDetector.stubs(:detect_locale)
DiscourseAi::Translation::VerboseLogger.expects(:log).with(includes("Detected #{Category.count} category locales"))
job.execute({})
end
describe "with public content limitation" do
# A second nil-locale category, fabricated as private for the staff group.
fab!(:private_category) { Fabricate(:private_category, group: Group[:staff], locale: nil) }
before do
# catch-all for other categories
# Declared first so the more specific expectations inside each example are matched ahead of it.
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).with(is_a(Category)).at_least_once
SiteSetting.ai_translation_backfill_limit_to_public_content = true
end
it "only processes public categories" do
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).with(category).once
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).with(private_category).never
job.execute({})
end
it "processes all categories when setting is disabled" do
# Overrides the value set in the surrounding before block.
SiteSetting.ai_translation_backfill_limit_to_public_content = false
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).with(category).once
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).with(private_category).once
job.execute({})
end
end
it "limits processing to the backfill rate" do
# With a rate of 1 and at least two nil-locale categories, exactly one detection call is expected.
SiteSetting.ai_translation_backfill_rate = 1
Fabricate(:category, locale: nil)
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).once
job.execute({})
end
end

View File

@ -0,0 +1,42 @@
# frozen_string_literal: true
# Specs for CategoryLocaleDetector.detect_locale. LanguageDetector is replaced
# with a double in every example that reaches it.
describe DiscourseAi::Translation::CategoryLocaleDetector do
  describe ".detect_locale" do
    fab!(:category) do
      Fabricate(
        :category,
        name: "Hello world",
        description: "Welcome to this category",
        locale: nil,
      )
    end

    # Stubs LanguageDetector.new so that, when built with opts[:text], it
    # returns a double whose #detect answers opts[:locale].
    def language_detector_stub(opts)
      detector_double = instance_double(DiscourseAi::Translation::LanguageDetector)
      allow(DiscourseAi::Translation::LanguageDetector).to receive(:new).with(
        opts[:text],
      ).and_return(detector_double)
      allow(detector_double).to receive(:detect).and_return(opts[:locale])
    end

    it "returns nil if category is blank" do
      result = described_class.detect_locale(nil)

      expect(result).to eq(nil)
    end

    it "updates the category locale with the detected locale" do
      # Name and description are expected to be joined by a blank line.
      combined_text = "#{category.name}\n\n#{category.description}"
      language_detector_stub({ text: combined_text, locale: "zh_CN" })

      expect { described_class.detect_locale(category) }.to change {
        category.reload.locale
      }.from(nil).to("zh_CN")
    end

    it "handles category with no description" do
      bare_category = Fabricate(:category, name: "Test Category", description: nil, locale: nil)
      # With a nil description only the name is passed to the detector.
      language_detector_stub({ text: bare_category.name, locale: "fr" })

      expect { described_class.detect_locale(bare_category) }.to change {
        bare_category.reload.locale
      }.from(nil).to("fr")
    end

    # NOTE(review): this example only checks the persisted value and does not
    # put the category into an invalid state first — confirm whether a
    # validation-failing fixture was intended.
    it "bypasses validations when updating locale" do
      combined_text = "#{category.name}\n\n#{category.description}"
      language_detector_stub({ text: combined_text, locale: "zh_CN" })

      described_class.detect_locale(category)

      expect(category.reload.locale).to eq("zh_CN")
    end
  end
end