discourse-ai/lib/translation/post_detection_text.rb

41 lines
1.1 KiB
Ruby
Raw Normal View History

# frozen_string_literal: true
module DiscourseAi
  module Translation
    # Extracts plain text from a post's cooked HTML for use in detection
    # (presumably language detection, given the enclosing Translation
    # namespace — confirm against callers).
    #
    # Markup that tends to skew detection is stripped in two tiers, and the
    # most aggressively cleaned text that is still non-blank is returned.
    class PostDetectionText
      # Elements that are always removed first; per the original author these
      # are the usual culprits for incorrect detection.
      # Frozen so callers cannot mutate the shared selector list at runtime.
      NECESSARY_REMOVAL_SELECTORS = [
        ".lightbox-wrapper", # image captions
        "blockquote, aside.quote", # quotes
      ].freeze

      # Elements removed in a second pass. If removing them leaves nothing,
      # the text from before this pass is used instead.
      # Frozen so callers cannot mutate the shared selector list at runtime.
      OPTIONAL_SELECTORS = [
        "a.hashtag-cooked", # categories or tags are usually in site's language
        "a.mention", # mentions are based on the mentioned's user's name
        "aside.onebox", # onebox external content
        "img.emoji",
        "code, pre",
      ].freeze

      # Returns the text of +post+ best suited for detection.
      #
      # @param post [#cooked, nil] object exposing its rendered HTML via
      #   +cooked+ (presumably a Discourse Post — confirm against callers)
      # @return [String, nil] +nil+ when +post+ or its cooked HTML is blank;
      #   otherwise the first non-blank candidate among: text with both
      #   selector tiers removed, text with only the necessary tier removed,
      #   and finally the full stripped text (which may itself be empty)
      def self.get_text(post)
        return if post.blank?

        cooked = post.cooked
        return if cooked.blank?

        doc = Nokogiri::HTML5.fragment(cooked)
        original = doc.text.strip

        # these selectors should be removed,
        # as they are the usual culprits for incorrect detection
        # NOTE: removal mutates +doc+ in place, so each pass below builds on
        # the previous one.
        doc.css(*NECESSARY_REMOVAL_SELECTORS).remove
        necessary = doc.text.strip

        doc.css(*OPTIONAL_SELECTORS).remove
        preferred = doc.text.strip

        # Fall back to progressively less-cleaned text when cleaning removed
        # everything.
        return preferred if preferred.present?
        return necessary if necessary.present?

        original
      end
    end
  end
end