mirror of
https://github.com/discourse/discourse-ai.git
synced 2025-07-23 14:33:28 +00:00
A more deterministic way of making sure the LLM detects the correct language (instead of relying on the prompt to tell the LLM to ignore unwanted content) is to take the cooked HTML and remove the unwanted elements. In this commit we remove quotes, image captions, etc. and use only the remaining text, falling back to the unmodified cooked text, and we update the prompts related to detection and translation - /152465/12
41 lines
1.1 KiB
Ruby
41 lines
1.1 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
module DiscourseAi
  module Translation
    # Chooses the text of a post that should be fed to language detection.
    #
    # The post's cooked HTML is parsed and progressively stripped of elements
    # that are the usual culprits for incorrect detection; the most aggressively
    # cleaned text that is still non-blank wins.
    class PostDetectionText
      # Elements that are always removed first.
      # Frozen so the shared selector list cannot be mutated at runtime.
      NECESSARY_REMOVAL_SELECTORS = [
        ".lightbox-wrapper", # image captions
        "blockquote, aside.quote", # quotes
      ].freeze

      # Elements removed in a second pass; the result of this pass is only
      # used when some text is left afterwards.
      OPTIONAL_SELECTORS = [
        "a.hashtag-cooked", # categories or tags are usually in site's language
        "a.mention", # mentions are based on the mentioned's user's name
        "aside.onebox", # onebox external content
        "img.emoji", # emoji images
        "code, pre", # inline code and code blocks
      ].freeze

      # Returns the text to use for language detection of +post+.
      #
      # @param post [#cooked, nil] object exposing its cooked HTML via +cooked+
      # @return [String, nil] nil when the post or its cooked HTML is blank;
      #   otherwise the first non-blank candidate among, in order:
      #   the text with both selector lists removed, the text with only the
      #   necessary selectors removed, and finally the full stripped text
      #   (which may itself be an empty string).
      def self.get_text(post)
        return if post.blank?

        cooked = post.cooked
        return if cooked.blank?

        doc = Nokogiri::HTML5.fragment(cooked)
        original = doc.text.strip

        # these selectors should be removed,
        # as they are the usual culprits for incorrect detection
        doc.css(*NECESSARY_REMOVAL_SELECTORS).remove
        necessary = doc.text.strip

        # the removals are cumulative: the same fragment is stripped further
        doc.css(*OPTIONAL_SELECTORS).remove
        preferred = doc.text.strip

        # Prefer the cleanest text, but never return blank if a less
        # aggressively cleaned variant still has content.
        [preferred, necessary].find(&:present?) || original
      end
    end
  end
end
|