mirror of
https://github.com/discourse/discourse-ai.git
synced 2025-07-24 23:13:27 +00:00
41 lines
1.1 KiB
Ruby
41 lines
1.1 KiB
Ruby
|
# frozen_string_literal: true
|
||
|
|
||
|
module DiscourseAi
|
||
|
module Translation
|
||
|
class PostDetectionText
|
||
|
NECESSARY_REMOVAL_SELECTORS = [
|
||
|
".lightbox-wrapper", # image captions
|
||
|
"blockquote, aside.quote", # quotes
|
||
|
]
|
||
|
OPTIONAL_SELECTORS = [
|
||
|
"a.hashtag-cooked", # categories or tags are usually in site's language
|
||
|
"a.mention", # mentions are based on the mentioned's user's name
|
||
|
"aside.onebox", # onebox external content
|
||
|
"img.emoji",
|
||
|
"code, pre",
|
||
|
]
|
||
|
|
||
|
def self.get_text(post)
|
||
|
return if post.blank?
|
||
|
cooked = post.cooked
|
||
|
return if cooked.blank?
|
||
|
|
||
|
doc = Nokogiri::HTML5.fragment(cooked)
|
||
|
original = doc.text.strip
|
||
|
|
||
|
# these selectors should be removed,
|
||
|
# as they are the usual culprits for incorrect detection
|
||
|
doc.css(*NECESSARY_REMOVAL_SELECTORS).remove
|
||
|
necessary = doc.text.strip
|
||
|
|
||
|
doc.css(*OPTIONAL_SELECTORS).remove
|
||
|
preferred = doc.text.strip
|
||
|
|
||
|
return preferred if preferred.present?
|
||
|
return necessary if necessary.present?
|
||
|
original
|
||
|
end
|
||
|
end
|
||
|
end
|
||
|
end
|