mirror of
https://github.com/discourse/discourse-ai.git
synced 2025-06-21 23:22:21 +00:00
DEV: Also detect locale of categories and do not translate if already in the locale (#1413)
Previously I had omitted to add `locale` to the category, as categories tended to be just a single word, and I did not find it would be worth to carry locale information. Due to certain LLMs that do poorer at translation, category descriptions got pretty messy. We added locale support here - https://github.com/discourse/discourse/pull/32962. This PR adds the automatic locale detection, and skips translating to the category's locale.
This commit is contained in:
parent
6817866de9
commit
8a3a247b11
@ -1,3 +1,4 @@
|
||||
< 3.5.0.beta6-dev: 3e74eea1e5e3143888d67a8d8a11206df214dc24
|
||||
< 3.5.0.beta3-dev: 09a68414804a1447f52e5d60691ba59742cda9ec
|
||||
< 3.5.0.beta2-dev: de8624416a15b3d8e7ad350b083cc1420451ccec
|
||||
< 3.5.0.beta1-dev: bdef136080074a993e7c4f5ca562edc31a8ba756
|
||||
|
@ -17,7 +17,8 @@ module Jobs
|
||||
cat_id = args[:from_category_id] || Category.order(:id).first&.id
|
||||
last_id = nil
|
||||
|
||||
categories = Category.where("id >= ?", cat_id).order(:id).limit(BATCH_SIZE)
|
||||
categories =
|
||||
Category.where("id >= ? AND locale IS NOT NULL", cat_id).order(:id).limit(BATCH_SIZE)
|
||||
return if categories.empty?
|
||||
|
||||
categories.each do |category|
|
||||
@ -26,9 +27,13 @@ module Jobs
|
||||
next
|
||||
end
|
||||
|
||||
CategoryLocalization.transaction do
|
||||
locales.each do |locale|
|
||||
next if CategoryLocalization.exists?(category_id: category.id, locale: locale)
|
||||
locales.each do |locale|
|
||||
localization = category.category_localizations.find_by(locale:)
|
||||
|
||||
if locale == category.locale && localization
|
||||
localization.destroy
|
||||
else
|
||||
next if locale == category.locale
|
||||
begin
|
||||
DiscourseAi::Translation::CategoryLocalizer.localize(category, locale)
|
||||
rescue FinalDestination::SSRFDetector::LookupFailedError
|
||||
|
37
app/jobs/scheduled/categories_locale_detection_backfill.rb
Normal file
37
app/jobs/scheduled/categories_locale_detection_backfill.rb
Normal file
@ -0,0 +1,37 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
module Jobs
|
||||
class CategoriesLocaleDetectionBackfill < ::Jobs::Scheduled
|
||||
every 1.hour
|
||||
sidekiq_options retry: false
|
||||
cluster_concurrency 1
|
||||
|
||||
def execute(args)
|
||||
return if !SiteSetting.discourse_ai_enabled
|
||||
return if !SiteSetting.ai_translation_enabled
|
||||
return if SiteSetting.ai_translation_backfill_rate == 0
|
||||
|
||||
categories = Category.where(locale: nil)
|
||||
|
||||
if SiteSetting.ai_translation_backfill_limit_to_public_content
|
||||
categories = categories.where(read_restricted: false)
|
||||
end
|
||||
|
||||
categories = categories.limit(SiteSetting.ai_translation_backfill_rate)
|
||||
return if categories.empty?
|
||||
|
||||
categories.each do |category|
|
||||
begin
|
||||
DiscourseAi::Translation::CategoryLocaleDetector.detect_locale(category)
|
||||
rescue FinalDestination::SSRFDetector::LookupFailedError
|
||||
rescue => e
|
||||
DiscourseAi::Translation::VerboseLogger.log(
|
||||
"Failed to detect category #{category.id}'s locale: #{e.message}",
|
||||
)
|
||||
end
|
||||
end
|
||||
|
||||
DiscourseAi::Translation::VerboseLogger.log("Detected #{categories.size} category locales")
|
||||
end
|
||||
end
|
||||
end
|
19
lib/translation/category_locale_detector.rb
Normal file
19
lib/translation/category_locale_detector.rb
Normal file
@ -0,0 +1,19 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
module DiscourseAi
|
||||
module Translation
|
||||
class CategoryLocaleDetector
|
||||
def self.detect_locale(category)
|
||||
return if category.blank?
|
||||
|
||||
text = [category.name, category.description].compact.join("\n\n")
|
||||
return if text.blank?
|
||||
|
||||
detected_locale = LanguageDetector.new(text).detect
|
||||
locale = LocaleNormalizer.normalize_to_i18n(detected_locale)
|
||||
category.update_column(:locale, locale)
|
||||
locale
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
@ -9,9 +9,12 @@ module DiscourseAi
|
||||
target_locale_sym = target_locale.to_s.sub("-", "_").to_sym
|
||||
|
||||
translated_name = ShortTextTranslator.new(category.name, target_locale_sym).translate
|
||||
# category descriptions are first paragraphs of posts
|
||||
translated_description =
|
||||
PostRawTranslator.new(category.description, target_locale_sym).translate
|
||||
if category.description.present?
|
||||
PostRawTranslator.new(category.description, target_locale_sym).translate
|
||||
else
|
||||
""
|
||||
end
|
||||
|
||||
localization =
|
||||
CategoryLocalization.find_or_initialize_by(
|
||||
|
@ -53,7 +53,9 @@ describe Jobs::LocalizeCategories do
|
||||
end
|
||||
|
||||
it "translates categories to the configured locales" do
|
||||
Category.update_all(locale: "en")
|
||||
number_of_categories = Category.count
|
||||
|
||||
DiscourseAi::Translation::CategoryLocalizer
|
||||
.expects(:localize)
|
||||
.with(is_a(Category), "pt")
|
||||
@ -69,20 +71,19 @@ describe Jobs::LocalizeCategories do
|
||||
it "skips categories that already have localizations" do
|
||||
localize_all_categories("pt", "zh_CN")
|
||||
|
||||
category1 =
|
||||
Fabricate(:category, name: "First Category", description: "First category description")
|
||||
Fabricate(:category_localization, category: category1, locale: "pt", name: "Primeira Categoria")
|
||||
|
||||
# It should only translate to Chinese, not Portuguese
|
||||
DiscourseAi::Translation::CategoryLocalizer.expects(:localize).with(category1, "pt").never
|
||||
DiscourseAi::Translation::CategoryLocalizer.expects(:localize).with(category1, "zh_CN").once
|
||||
DiscourseAi::Translation::CategoryLocalizer.expects(:localize).with(is_a(Category), "pt").never
|
||||
DiscourseAi::Translation::CategoryLocalizer
|
||||
.expects(:localize)
|
||||
.with(is_a(Category), "zh_CN")
|
||||
.never
|
||||
|
||||
job.execute({})
|
||||
end
|
||||
|
||||
it "continues from a specified category ID" do
|
||||
category1 = Fabricate(:category, name: "First", description: "First description")
|
||||
category2 = Fabricate(:category, name: "Second", description: "Second description")
|
||||
category1 = Fabricate(:category, name: "First", description: "First description", locale: "en")
|
||||
category2 =
|
||||
Fabricate(:category, name: "Second", description: "Second description", locale: "en")
|
||||
|
||||
DiscourseAi::Translation::CategoryLocalizer
|
||||
.expects(:localize)
|
||||
@ -99,7 +100,7 @@ describe Jobs::LocalizeCategories do
|
||||
it "handles translation errors gracefully" do
|
||||
localize_all_categories("pt", "zh_CN")
|
||||
|
||||
category1 = Fabricate(:category, name: "First", description: "First description")
|
||||
category1 = Fabricate(:category, name: "First", description: "First description", locale: "en")
|
||||
DiscourseAi::Translation::CategoryLocalizer
|
||||
.expects(:localize)
|
||||
.with(category1, "pt")
|
||||
@ -110,6 +111,8 @@ describe Jobs::LocalizeCategories do
|
||||
end
|
||||
|
||||
it "enqueues the next batch when there are more categories" do
|
||||
Category.update_all(locale: "en")
|
||||
|
||||
Jobs.run_later!
|
||||
freeze_time
|
||||
Jobs::LocalizeCategories.const_set(:BATCH_SIZE, 1)
|
||||
@ -134,10 +137,8 @@ describe Jobs::LocalizeCategories do
|
||||
it "skips read-restricted categories when configured" do
|
||||
SiteSetting.ai_translation_backfill_limit_to_public_content = true
|
||||
|
||||
category1 = Fabricate(:category, name: "Public Category", read_restricted: false)
|
||||
category2 = Fabricate(:category, name: "Private Category", read_restricted: true)
|
||||
|
||||
DiscourseAi::Translation::CategoryLocalizer.expects(:localize).at_least_once
|
||||
category1 = Fabricate(:category, name: "Public Category", read_restricted: false, locale: "en")
|
||||
category2 = Fabricate(:category, name: "Private Category", read_restricted: true, locale: "en")
|
||||
|
||||
DiscourseAi::Translation::CategoryLocalizer
|
||||
.expects(:localize)
|
||||
@ -150,4 +151,40 @@ describe Jobs::LocalizeCategories do
|
||||
|
||||
job.execute({})
|
||||
end
|
||||
|
||||
it "skips creating localizations in the same language as the category's locale" do
|
||||
Category.update_all(locale: "pt")
|
||||
|
||||
DiscourseAi::Translation::CategoryLocalizer.expects(:localize).with(is_a(Category), "pt").never
|
||||
DiscourseAi::Translation::CategoryLocalizer
|
||||
.expects(:localize)
|
||||
.with(is_a(Category), "zh_CN")
|
||||
.times(Category.count)
|
||||
|
||||
job.execute({})
|
||||
end
|
||||
|
||||
it "deletes existing localizations that match the category's locale" do
|
||||
# update all categories to portuguese
|
||||
Category.update_all(locale: "pt")
|
||||
|
||||
localize_all_categories("pt", "zh_CN")
|
||||
|
||||
expect { job.execute({}) }.to change { CategoryLocalization.exists?(locale: "pt") }.from(
|
||||
true,
|
||||
).to(false)
|
||||
end
|
||||
|
||||
it "doesn't process categories with nil locale" do
|
||||
# Add a category with nil locale
|
||||
nil_locale_category = Fabricate(:category, name: "No Locale", locale: nil)
|
||||
|
||||
# Make sure our query for categories with non-null locales excludes it
|
||||
DiscourseAi::Translation::CategoryLocalizer
|
||||
.expects(:localize)
|
||||
.with(nil_locale_category, any_parameters)
|
||||
.never
|
||||
|
||||
job.execute({})
|
||||
end
|
||||
end
|
||||
|
103
spec/jobs/scheduled/categories_locale_detection_backfill_spec.rb
Normal file
103
spec/jobs/scheduled/categories_locale_detection_backfill_spec.rb
Normal file
@ -0,0 +1,103 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
describe Jobs::CategoriesLocaleDetectionBackfill do
|
||||
fab!(:category) { Fabricate(:category, locale: nil) }
|
||||
subject(:job) { described_class.new }
|
||||
|
||||
before do
|
||||
SiteSetting.discourse_ai_enabled = true
|
||||
Fabricate(:fake_model).tap do |fake_llm|
|
||||
SiteSetting.public_send("ai_translation_model=", "custom:#{fake_llm.id}")
|
||||
end
|
||||
SiteSetting.ai_translation_enabled = true
|
||||
SiteSetting.ai_translation_backfill_rate = 100
|
||||
end
|
||||
|
||||
it "does nothing when AI is disabled" do
|
||||
SiteSetting.discourse_ai_enabled = false
|
||||
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).never
|
||||
|
||||
job.execute({})
|
||||
end
|
||||
|
||||
it "does nothing when content translation is disabled" do
|
||||
SiteSetting.ai_translation_enabled = false
|
||||
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).never
|
||||
|
||||
job.execute({})
|
||||
end
|
||||
|
||||
it "does nothing when backfill rate is 0" do
|
||||
SiteSetting.ai_translation_backfill_rate = 0
|
||||
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).never
|
||||
|
||||
job.execute({})
|
||||
end
|
||||
|
||||
it "does nothing when there are no categories to detect" do
|
||||
Category.update_all(locale: "en")
|
||||
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).never
|
||||
|
||||
job.execute({})
|
||||
end
|
||||
|
||||
it "detects locale for categories with nil locale" do
|
||||
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).with(is_a(Category)).times(Category.count)
|
||||
|
||||
job.execute({})
|
||||
end
|
||||
|
||||
it "handles detection errors gracefully" do
|
||||
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).with(is_a(Category)).at_least_once
|
||||
DiscourseAi::Translation::CategoryLocaleDetector
|
||||
.expects(:detect_locale)
|
||||
.with(category)
|
||||
.raises(StandardError.new("error"))
|
||||
.once
|
||||
|
||||
expect { job.execute({}) }.not_to raise_error
|
||||
end
|
||||
|
||||
it "logs a summary after running" do
|
||||
DiscourseAi::Translation::CategoryLocaleDetector.stubs(:detect_locale)
|
||||
DiscourseAi::Translation::VerboseLogger.expects(:log).with(includes("Detected #{Category.count} category locales"))
|
||||
|
||||
job.execute({})
|
||||
end
|
||||
|
||||
describe "with public content limitation" do
|
||||
fab!(:private_category) { Fabricate(:private_category, group: Group[:staff], locale: nil) }
|
||||
|
||||
before do
|
||||
# catch-all for other categories
|
||||
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).with(is_a(Category)).at_least_once
|
||||
|
||||
SiteSetting.ai_translation_backfill_limit_to_public_content = true
|
||||
end
|
||||
|
||||
it "only processes public categories" do
|
||||
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).with(category).once
|
||||
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).with(private_category).never
|
||||
|
||||
job.execute({})
|
||||
end
|
||||
|
||||
it "processes all categories when setting is disabled" do
|
||||
SiteSetting.ai_translation_backfill_limit_to_public_content = false
|
||||
|
||||
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).with(category).once
|
||||
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).with(private_category).once
|
||||
|
||||
job.execute({})
|
||||
end
|
||||
end
|
||||
|
||||
it "limits processing to the backfill rate" do
|
||||
SiteSetting.ai_translation_backfill_rate = 1
|
||||
Fabricate(:category, locale: nil)
|
||||
|
||||
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).once
|
||||
|
||||
job.execute({})
|
||||
end
|
||||
end
|
42
spec/lib/translation/category_locale_detector_spec.rb
Normal file
42
spec/lib/translation/category_locale_detector_spec.rb
Normal file
@ -0,0 +1,42 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
describe DiscourseAi::Translation::CategoryLocaleDetector do
|
||||
describe ".detect_locale" do
|
||||
fab!(:category) { Fabricate(:category, name: "Hello world", description: "Welcome to this category", locale: nil) }
|
||||
|
||||
def language_detector_stub(opts)
|
||||
mock = instance_double(DiscourseAi::Translation::LanguageDetector)
|
||||
allow(DiscourseAi::Translation::LanguageDetector).to receive(:new).with(
|
||||
opts[:text],
|
||||
).and_return(mock)
|
||||
allow(mock).to receive(:detect).and_return(opts[:locale])
|
||||
end
|
||||
|
||||
it "returns nil if category is blank" do
|
||||
expect(described_class.detect_locale(nil)).to eq(nil)
|
||||
end
|
||||
|
||||
it "updates the category locale with the detected locale" do
|
||||
text = "#{category.name}\n\n#{category.description}"
|
||||
language_detector_stub({ text: text, locale: "zh_CN" })
|
||||
|
||||
expect { described_class.detect_locale(category) }.to change { category.reload.locale }.from(nil).to(
|
||||
"zh_CN",
|
||||
)
|
||||
end
|
||||
|
||||
it "handles category with no description" do
|
||||
no_description_category = Fabricate(:category, name: "Test Category", description: nil, locale: nil)
|
||||
language_detector_stub({ text: no_description_category.name, locale: "fr" })
|
||||
|
||||
expect { described_class.detect_locale(no_description_category) }.to change { no_description_category.reload.locale }.from(nil).to("fr")
|
||||
end
|
||||
|
||||
it "bypasses validations when updating locale" do
|
||||
language_detector_stub({ text: "#{category.name}\n\n#{category.description}", locale: "zh_CN" })
|
||||
|
||||
described_class.detect_locale(category)
|
||||
expect(category.reload.locale).to eq("zh_CN")
|
||||
end
|
||||
end
|
||||
end
|
Loading…
x
Reference in New Issue
Block a user