# discourse-ai/lib/translation/post_detection_text.rb

# frozen_string_literal: true

module DiscourseAi
  module Translation
    # Extracts, from a post's cooked HTML, the plain text that is most
    # suitable for detection (judging by the selector notes below, this is
    # language detection — confirm against callers).
    #
    # Markup that tends to skew detection is stripped in two stages, and the
    # most aggressively cleaned text that is still non-blank is returned.
    class PostDetectionText
      # Elements that are always removed first, as they are the usual
      # culprits for incorrect detection.
      NECESSARY_REMOVAL_SELECTORS = [
        ".lightbox-wrapper", # image captions
        "blockquote, aside.quote", # quotes
      ].freeze

      # Elements removed in a second pass; if nothing is left afterwards the
      # result falls back to the text that still includes them.
      OPTIONAL_SELECTORS = [
        "a.hashtag-cooked", # categories or tags are usually in site's language
        "a.mention", # mentions are based on the mentioned user's name
        "aside.onebox", # onebox external content
        "img.emoji",
        "code, pre",
      ].freeze

      # Returns the text of +post+ to use for detection.
      #
      # Candidates, in order of preference:
      # 1. text left after removing both NECESSARY_REMOVAL_SELECTORS and
      #    OPTIONAL_SELECTORS matches,
      # 2. text left after removing only NECESSARY_REMOVAL_SELECTORS matches,
      # 3. the full, unfiltered text of the cooked HTML.
      #
      # @param post [#cooked] object exposing the rendered HTML via +cooked+
      # @return [String, nil] the stripped text, or +nil+ when +post+ or its
      #   cooked HTML is blank. The returned string may be empty when the
      #   cooked HTML has no text at all.
      def self.get_text(post)
        return if post.blank?

        cooked = post.cooked
        return if cooked.blank?

        doc = Nokogiri::HTML5.fragment(cooked)
        original = doc.text.strip

        # Removal mutates the fragment in place, so each stage builds on the
        # previous one and the order of these statements matters.
        doc.css(*NECESSARY_REMOVAL_SELECTORS).remove
        necessary = doc.text.strip

        doc.css(*OPTIONAL_SELECTORS).remove
        preferred = doc.text.strip

        # Prefer the most filtered non-blank candidate; otherwise fall back to
        # the unfiltered text.
        [preferred, necessary].find(&:present?) || original
      end
    end
  end
end