mirror of
https://github.com/discourse/discourse-ai.git
synced 2025-06-22 15:42:14 +00:00
DEV: Also detect locale of categories and do not translate if already in the locale (#1413)
Previously I had omitted to add `locale` to the category, as categories tended to be just a single word, and I did not find it would be worth to carry locale information. Due to certain LLMs that do poorer at translation, category descriptions got pretty messy. We added locale support here - https://github.com/discourse/discourse/pull/32962. This PR adds the automatic locale detection, and skips translating to the category's locale.
This commit is contained in:
parent
6817866de9
commit
8a3a247b11
@ -1,3 +1,4 @@
|
|||||||
|
< 3.5.0.beta6-dev: 3e74eea1e5e3143888d67a8d8a11206df214dc24
|
||||||
< 3.5.0.beta3-dev: 09a68414804a1447f52e5d60691ba59742cda9ec
|
< 3.5.0.beta3-dev: 09a68414804a1447f52e5d60691ba59742cda9ec
|
||||||
< 3.5.0.beta2-dev: de8624416a15b3d8e7ad350b083cc1420451ccec
|
< 3.5.0.beta2-dev: de8624416a15b3d8e7ad350b083cc1420451ccec
|
||||||
< 3.5.0.beta1-dev: bdef136080074a993e7c4f5ca562edc31a8ba756
|
< 3.5.0.beta1-dev: bdef136080074a993e7c4f5ca562edc31a8ba756
|
||||||
|
@ -17,7 +17,8 @@ module Jobs
|
|||||||
cat_id = args[:from_category_id] || Category.order(:id).first&.id
|
cat_id = args[:from_category_id] || Category.order(:id).first&.id
|
||||||
last_id = nil
|
last_id = nil
|
||||||
|
|
||||||
categories = Category.where("id >= ?", cat_id).order(:id).limit(BATCH_SIZE)
|
categories =
|
||||||
|
Category.where("id >= ? AND locale IS NOT NULL", cat_id).order(:id).limit(BATCH_SIZE)
|
||||||
return if categories.empty?
|
return if categories.empty?
|
||||||
|
|
||||||
categories.each do |category|
|
categories.each do |category|
|
||||||
@ -26,9 +27,13 @@ module Jobs
|
|||||||
next
|
next
|
||||||
end
|
end
|
||||||
|
|
||||||
CategoryLocalization.transaction do
|
|
||||||
locales.each do |locale|
|
locales.each do |locale|
|
||||||
next if CategoryLocalization.exists?(category_id: category.id, locale: locale)
|
localization = category.category_localizations.find_by(locale:)
|
||||||
|
|
||||||
|
if locale == category.locale && localization
|
||||||
|
localization.destroy
|
||||||
|
else
|
||||||
|
next if locale == category.locale
|
||||||
begin
|
begin
|
||||||
DiscourseAi::Translation::CategoryLocalizer.localize(category, locale)
|
DiscourseAi::Translation::CategoryLocalizer.localize(category, locale)
|
||||||
rescue FinalDestination::SSRFDetector::LookupFailedError
|
rescue FinalDestination::SSRFDetector::LookupFailedError
|
||||||
|
37
app/jobs/scheduled/categories_locale_detection_backfill.rb
Normal file
37
app/jobs/scheduled/categories_locale_detection_backfill.rb
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
# frozen_string_literal: true
|
||||||
|
|
||||||
|
module Jobs
|
||||||
|
class CategoriesLocaleDetectionBackfill < ::Jobs::Scheduled
|
||||||
|
every 1.hour
|
||||||
|
sidekiq_options retry: false
|
||||||
|
cluster_concurrency 1
|
||||||
|
|
||||||
|
def execute(args)
|
||||||
|
return if !SiteSetting.discourse_ai_enabled
|
||||||
|
return if !SiteSetting.ai_translation_enabled
|
||||||
|
return if SiteSetting.ai_translation_backfill_rate == 0
|
||||||
|
|
||||||
|
categories = Category.where(locale: nil)
|
||||||
|
|
||||||
|
if SiteSetting.ai_translation_backfill_limit_to_public_content
|
||||||
|
categories = categories.where(read_restricted: false)
|
||||||
|
end
|
||||||
|
|
||||||
|
categories = categories.limit(SiteSetting.ai_translation_backfill_rate)
|
||||||
|
return if categories.empty?
|
||||||
|
|
||||||
|
categories.each do |category|
|
||||||
|
begin
|
||||||
|
DiscourseAi::Translation::CategoryLocaleDetector.detect_locale(category)
|
||||||
|
rescue FinalDestination::SSRFDetector::LookupFailedError
|
||||||
|
rescue => e
|
||||||
|
DiscourseAi::Translation::VerboseLogger.log(
|
||||||
|
"Failed to detect category #{category.id}'s locale: #{e.message}",
|
||||||
|
)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
DiscourseAi::Translation::VerboseLogger.log("Detected #{categories.size} category locales")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
19
lib/translation/category_locale_detector.rb
Normal file
19
lib/translation/category_locale_detector.rb
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
# frozen_string_literal: true
|
||||||
|
|
||||||
|
module DiscourseAi
|
||||||
|
module Translation
|
||||||
|
class CategoryLocaleDetector
|
||||||
|
def self.detect_locale(category)
|
||||||
|
return if category.blank?
|
||||||
|
|
||||||
|
text = [category.name, category.description].compact.join("\n\n")
|
||||||
|
return if text.blank?
|
||||||
|
|
||||||
|
detected_locale = LanguageDetector.new(text).detect
|
||||||
|
locale = LocaleNormalizer.normalize_to_i18n(detected_locale)
|
||||||
|
category.update_column(:locale, locale)
|
||||||
|
locale
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
@ -9,9 +9,12 @@ module DiscourseAi
|
|||||||
target_locale_sym = target_locale.to_s.sub("-", "_").to_sym
|
target_locale_sym = target_locale.to_s.sub("-", "_").to_sym
|
||||||
|
|
||||||
translated_name = ShortTextTranslator.new(category.name, target_locale_sym).translate
|
translated_name = ShortTextTranslator.new(category.name, target_locale_sym).translate
|
||||||
# category descriptions are first paragraphs of posts
|
|
||||||
translated_description =
|
translated_description =
|
||||||
|
if category.description.present?
|
||||||
PostRawTranslator.new(category.description, target_locale_sym).translate
|
PostRawTranslator.new(category.description, target_locale_sym).translate
|
||||||
|
else
|
||||||
|
""
|
||||||
|
end
|
||||||
|
|
||||||
localization =
|
localization =
|
||||||
CategoryLocalization.find_or_initialize_by(
|
CategoryLocalization.find_or_initialize_by(
|
||||||
|
@ -53,7 +53,9 @@ describe Jobs::LocalizeCategories do
|
|||||||
end
|
end
|
||||||
|
|
||||||
it "translates categories to the configured locales" do
|
it "translates categories to the configured locales" do
|
||||||
|
Category.update_all(locale: "en")
|
||||||
number_of_categories = Category.count
|
number_of_categories = Category.count
|
||||||
|
|
||||||
DiscourseAi::Translation::CategoryLocalizer
|
DiscourseAi::Translation::CategoryLocalizer
|
||||||
.expects(:localize)
|
.expects(:localize)
|
||||||
.with(is_a(Category), "pt")
|
.with(is_a(Category), "pt")
|
||||||
@ -69,20 +71,19 @@ describe Jobs::LocalizeCategories do
|
|||||||
it "skips categories that already have localizations" do
|
it "skips categories that already have localizations" do
|
||||||
localize_all_categories("pt", "zh_CN")
|
localize_all_categories("pt", "zh_CN")
|
||||||
|
|
||||||
category1 =
|
DiscourseAi::Translation::CategoryLocalizer.expects(:localize).with(is_a(Category), "pt").never
|
||||||
Fabricate(:category, name: "First Category", description: "First category description")
|
DiscourseAi::Translation::CategoryLocalizer
|
||||||
Fabricate(:category_localization, category: category1, locale: "pt", name: "Primeira Categoria")
|
.expects(:localize)
|
||||||
|
.with(is_a(Category), "zh_CN")
|
||||||
# It should only translate to Chinese, not Portuguese
|
.never
|
||||||
DiscourseAi::Translation::CategoryLocalizer.expects(:localize).with(category1, "pt").never
|
|
||||||
DiscourseAi::Translation::CategoryLocalizer.expects(:localize).with(category1, "zh_CN").once
|
|
||||||
|
|
||||||
job.execute({})
|
job.execute({})
|
||||||
end
|
end
|
||||||
|
|
||||||
it "continues from a specified category ID" do
|
it "continues from a specified category ID" do
|
||||||
category1 = Fabricate(:category, name: "First", description: "First description")
|
category1 = Fabricate(:category, name: "First", description: "First description", locale: "en")
|
||||||
category2 = Fabricate(:category, name: "Second", description: "Second description")
|
category2 =
|
||||||
|
Fabricate(:category, name: "Second", description: "Second description", locale: "en")
|
||||||
|
|
||||||
DiscourseAi::Translation::CategoryLocalizer
|
DiscourseAi::Translation::CategoryLocalizer
|
||||||
.expects(:localize)
|
.expects(:localize)
|
||||||
@ -99,7 +100,7 @@ describe Jobs::LocalizeCategories do
|
|||||||
it "handles translation errors gracefully" do
|
it "handles translation errors gracefully" do
|
||||||
localize_all_categories("pt", "zh_CN")
|
localize_all_categories("pt", "zh_CN")
|
||||||
|
|
||||||
category1 = Fabricate(:category, name: "First", description: "First description")
|
category1 = Fabricate(:category, name: "First", description: "First description", locale: "en")
|
||||||
DiscourseAi::Translation::CategoryLocalizer
|
DiscourseAi::Translation::CategoryLocalizer
|
||||||
.expects(:localize)
|
.expects(:localize)
|
||||||
.with(category1, "pt")
|
.with(category1, "pt")
|
||||||
@ -110,6 +111,8 @@ describe Jobs::LocalizeCategories do
|
|||||||
end
|
end
|
||||||
|
|
||||||
it "enqueues the next batch when there are more categories" do
|
it "enqueues the next batch when there are more categories" do
|
||||||
|
Category.update_all(locale: "en")
|
||||||
|
|
||||||
Jobs.run_later!
|
Jobs.run_later!
|
||||||
freeze_time
|
freeze_time
|
||||||
Jobs::LocalizeCategories.const_set(:BATCH_SIZE, 1)
|
Jobs::LocalizeCategories.const_set(:BATCH_SIZE, 1)
|
||||||
@ -134,10 +137,8 @@ describe Jobs::LocalizeCategories do
|
|||||||
it "skips read-restricted categories when configured" do
|
it "skips read-restricted categories when configured" do
|
||||||
SiteSetting.ai_translation_backfill_limit_to_public_content = true
|
SiteSetting.ai_translation_backfill_limit_to_public_content = true
|
||||||
|
|
||||||
category1 = Fabricate(:category, name: "Public Category", read_restricted: false)
|
category1 = Fabricate(:category, name: "Public Category", read_restricted: false, locale: "en")
|
||||||
category2 = Fabricate(:category, name: "Private Category", read_restricted: true)
|
category2 = Fabricate(:category, name: "Private Category", read_restricted: true, locale: "en")
|
||||||
|
|
||||||
DiscourseAi::Translation::CategoryLocalizer.expects(:localize).at_least_once
|
|
||||||
|
|
||||||
DiscourseAi::Translation::CategoryLocalizer
|
DiscourseAi::Translation::CategoryLocalizer
|
||||||
.expects(:localize)
|
.expects(:localize)
|
||||||
@ -150,4 +151,40 @@ describe Jobs::LocalizeCategories do
|
|||||||
|
|
||||||
job.execute({})
|
job.execute({})
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it "skips creating localizations in the same language as the category's locale" do
|
||||||
|
Category.update_all(locale: "pt")
|
||||||
|
|
||||||
|
DiscourseAi::Translation::CategoryLocalizer.expects(:localize).with(is_a(Category), "pt").never
|
||||||
|
DiscourseAi::Translation::CategoryLocalizer
|
||||||
|
.expects(:localize)
|
||||||
|
.with(is_a(Category), "zh_CN")
|
||||||
|
.times(Category.count)
|
||||||
|
|
||||||
|
job.execute({})
|
||||||
|
end
|
||||||
|
|
||||||
|
it "deletes existing localizations that match the category's locale" do
|
||||||
|
# update all categories to portuguese
|
||||||
|
Category.update_all(locale: "pt")
|
||||||
|
|
||||||
|
localize_all_categories("pt", "zh_CN")
|
||||||
|
|
||||||
|
expect { job.execute({}) }.to change { CategoryLocalization.exists?(locale: "pt") }.from(
|
||||||
|
true,
|
||||||
|
).to(false)
|
||||||
|
end
|
||||||
|
|
||||||
|
it "doesn't process categories with nil locale" do
|
||||||
|
# Add a category with nil locale
|
||||||
|
nil_locale_category = Fabricate(:category, name: "No Locale", locale: nil)
|
||||||
|
|
||||||
|
# Make sure our query for categories with non-null locales excludes it
|
||||||
|
DiscourseAi::Translation::CategoryLocalizer
|
||||||
|
.expects(:localize)
|
||||||
|
.with(nil_locale_category, any_parameters)
|
||||||
|
.never
|
||||||
|
|
||||||
|
job.execute({})
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
103
spec/jobs/scheduled/categories_locale_detection_backfill_spec.rb
Normal file
103
spec/jobs/scheduled/categories_locale_detection_backfill_spec.rb
Normal file
@ -0,0 +1,103 @@
|
|||||||
|
# frozen_string_literal: true
|
||||||
|
|
||||||
|
describe Jobs::CategoriesLocaleDetectionBackfill do
|
||||||
|
fab!(:category) { Fabricate(:category, locale: nil) }
|
||||||
|
subject(:job) { described_class.new }
|
||||||
|
|
||||||
|
before do
|
||||||
|
SiteSetting.discourse_ai_enabled = true
|
||||||
|
Fabricate(:fake_model).tap do |fake_llm|
|
||||||
|
SiteSetting.public_send("ai_translation_model=", "custom:#{fake_llm.id}")
|
||||||
|
end
|
||||||
|
SiteSetting.ai_translation_enabled = true
|
||||||
|
SiteSetting.ai_translation_backfill_rate = 100
|
||||||
|
end
|
||||||
|
|
||||||
|
it "does nothing when AI is disabled" do
|
||||||
|
SiteSetting.discourse_ai_enabled = false
|
||||||
|
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).never
|
||||||
|
|
||||||
|
job.execute({})
|
||||||
|
end
|
||||||
|
|
||||||
|
it "does nothing when content translation is disabled" do
|
||||||
|
SiteSetting.ai_translation_enabled = false
|
||||||
|
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).never
|
||||||
|
|
||||||
|
job.execute({})
|
||||||
|
end
|
||||||
|
|
||||||
|
it "does nothing when backfill rate is 0" do
|
||||||
|
SiteSetting.ai_translation_backfill_rate = 0
|
||||||
|
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).never
|
||||||
|
|
||||||
|
job.execute({})
|
||||||
|
end
|
||||||
|
|
||||||
|
it "does nothing when there are no categories to detect" do
|
||||||
|
Category.update_all(locale: "en")
|
||||||
|
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).never
|
||||||
|
|
||||||
|
job.execute({})
|
||||||
|
end
|
||||||
|
|
||||||
|
it "detects locale for categories with nil locale" do
|
||||||
|
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).with(is_a(Category)).times(Category.count)
|
||||||
|
|
||||||
|
job.execute({})
|
||||||
|
end
|
||||||
|
|
||||||
|
it "handles detection errors gracefully" do
|
||||||
|
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).with(is_a(Category)).at_least_once
|
||||||
|
DiscourseAi::Translation::CategoryLocaleDetector
|
||||||
|
.expects(:detect_locale)
|
||||||
|
.with(category)
|
||||||
|
.raises(StandardError.new("error"))
|
||||||
|
.once
|
||||||
|
|
||||||
|
expect { job.execute({}) }.not_to raise_error
|
||||||
|
end
|
||||||
|
|
||||||
|
it "logs a summary after running" do
|
||||||
|
DiscourseAi::Translation::CategoryLocaleDetector.stubs(:detect_locale)
|
||||||
|
DiscourseAi::Translation::VerboseLogger.expects(:log).with(includes("Detected #{Category.count} category locales"))
|
||||||
|
|
||||||
|
job.execute({})
|
||||||
|
end
|
||||||
|
|
||||||
|
describe "with public content limitation" do
|
||||||
|
fab!(:private_category) { Fabricate(:private_category, group: Group[:staff], locale: nil) }
|
||||||
|
|
||||||
|
before do
|
||||||
|
# catch-all for other categories
|
||||||
|
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).with(is_a(Category)).at_least_once
|
||||||
|
|
||||||
|
SiteSetting.ai_translation_backfill_limit_to_public_content = true
|
||||||
|
end
|
||||||
|
|
||||||
|
it "only processes public categories" do
|
||||||
|
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).with(category).once
|
||||||
|
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).with(private_category).never
|
||||||
|
|
||||||
|
job.execute({})
|
||||||
|
end
|
||||||
|
|
||||||
|
it "processes all categories when setting is disabled" do
|
||||||
|
SiteSetting.ai_translation_backfill_limit_to_public_content = false
|
||||||
|
|
||||||
|
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).with(category).once
|
||||||
|
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).with(private_category).once
|
||||||
|
|
||||||
|
job.execute({})
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
it "limits processing to the backfill rate" do
|
||||||
|
SiteSetting.ai_translation_backfill_rate = 1
|
||||||
|
Fabricate(:category, locale: nil)
|
||||||
|
|
||||||
|
DiscourseAi::Translation::CategoryLocaleDetector.expects(:detect_locale).once
|
||||||
|
|
||||||
|
job.execute({})
|
||||||
|
end
|
||||||
|
end
|
42
spec/lib/translation/category_locale_detector_spec.rb
Normal file
42
spec/lib/translation/category_locale_detector_spec.rb
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
# frozen_string_literal: true
|
||||||
|
|
||||||
|
describe DiscourseAi::Translation::CategoryLocaleDetector do
|
||||||
|
describe ".detect_locale" do
|
||||||
|
fab!(:category) { Fabricate(:category, name: "Hello world", description: "Welcome to this category", locale: nil) }
|
||||||
|
|
||||||
|
def language_detector_stub(opts)
|
||||||
|
mock = instance_double(DiscourseAi::Translation::LanguageDetector)
|
||||||
|
allow(DiscourseAi::Translation::LanguageDetector).to receive(:new).with(
|
||||||
|
opts[:text],
|
||||||
|
).and_return(mock)
|
||||||
|
allow(mock).to receive(:detect).and_return(opts[:locale])
|
||||||
|
end
|
||||||
|
|
||||||
|
it "returns nil if category is blank" do
|
||||||
|
expect(described_class.detect_locale(nil)).to eq(nil)
|
||||||
|
end
|
||||||
|
|
||||||
|
it "updates the category locale with the detected locale" do
|
||||||
|
text = "#{category.name}\n\n#{category.description}"
|
||||||
|
language_detector_stub({ text: text, locale: "zh_CN" })
|
||||||
|
|
||||||
|
expect { described_class.detect_locale(category) }.to change { category.reload.locale }.from(nil).to(
|
||||||
|
"zh_CN",
|
||||||
|
)
|
||||||
|
end
|
||||||
|
|
||||||
|
it "handles category with no description" do
|
||||||
|
no_description_category = Fabricate(:category, name: "Test Category", description: nil, locale: nil)
|
||||||
|
language_detector_stub({ text: no_description_category.name, locale: "fr" })
|
||||||
|
|
||||||
|
expect { described_class.detect_locale(no_description_category) }.to change { no_description_category.reload.locale }.from(nil).to("fr")
|
||||||
|
end
|
||||||
|
|
||||||
|
it "bypasses validations when updating locale" do
|
||||||
|
language_detector_stub({ text: "#{category.name}\n\n#{category.description}", locale: "zh_CN" })
|
||||||
|
|
||||||
|
described_class.detect_locale(category)
|
||||||
|
expect(category.reload.locale).to eq("zh_CN")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
Loading…
x
Reference in New Issue
Block a user