discourse/app/models/topic_embed.rb

require_dependency 'nokogiri'

class TopicEmbed < ActiveRecord::Base
  include Trashable

  belongs_to :topic
  belongs_to :post
  validates_presence_of :embed_url
  validates_uniqueness_of :embed_url

  before_validation(on: :create) do
    unless (topic_embed = TopicEmbed.with_deleted.where('deleted_at IS NOT NULL AND embed_url = ?', embed_url).first).nil?
      topic_embed.destroy!
    end
  end

  class FetchResponse
    attr_accessor :title, :body, :author
  end

  def self.normalize_url(url)
    url.downcase.sub(/\/$/, '').sub(/\-+/, '-').strip
  end

  def self.imported_from_html(url)
    "\n<hr>\n<small>#{I18n.t('embed.imported_from', link: "<a href='#{url}'>#{url}</a>")}</small>\n"
  end

  DOUBLE_ESCAPED_EXPR = /%25([0-9a-f]{2})/i

  # Prevents double URL encode
  # https://stackoverflow.com/a/37599235
  def self.escape_uri(uri, pattern = URI::UNSAFE)
    encoded = URI.encode(uri, pattern)
    encoded.gsub!(DOUBLE_ESCAPED_EXPR, '%\1')
    encoded
  end

  # Import an article from a source (RSS/Atom/Other)
  def self.import(user, url, title, contents)
    return unless url =~ /^https?\:\/\//

    if SiteSetting.embed_truncate
      contents = first_paragraph_from(contents)
    end
    contents ||= ''
    contents << imported_from_html(url)

    url = normalize_url(url)

    embed = TopicEmbed.find_by("lower(embed_url) = ?", url)
    content_sha1 = Digest::SHA1.hexdigest(contents)
    post = nil

    # If there is no embed, create a topic, post and the embed.
    if embed.blank?
      Topic.transaction do
        eh = EmbeddableHost.record_for_url(url)

        creator = PostCreator.new(user,
                                  title: title,
                                  raw: absolutize_urls(url, contents),
                                  skip_validations: true,
                                  cook_method: Post.cook_methods[:raw_html],
                                  category: eh.try(:category_id))
        post = creator.create
        if post.present?
          TopicEmbed.create!(topic_id: post.topic_id,
                             embed_url: url,
                             content_sha1: content_sha1,
                             post_id: post.id)
        end
      end
    else
      absolutize_urls(url, contents)
      post = embed.post
      # Update the topic if it changed
      if post && post.topic && content_sha1 != embed.content_sha1
        post.revise(user, { raw: absolutize_urls(url, contents) }, skip_validations: true, bypass_rate_limiter: true)
        embed.update_column(:content_sha1, content_sha1)
      end
    end

    post
  end

  def self.find_remote(url)
    require 'ruby-readability'

    url = escape_uri(url)
    original_uri = URI.parse(url)
    opts = {
      tags: %w[div p code pre h1 h2 h3 b em i strong a img ul li ol blockquote],
      attributes: %w[href src class],
      remove_empty_nodes: false
    }

    opts[:whitelist] = SiteSetting.embed_whitelist_selector if SiteSetting.embed_whitelist_selector.present?
    opts[:blacklist] = SiteSetting.embed_blacklist_selector if SiteSetting.embed_blacklist_selector.present?
    embed_classname_whitelist = SiteSetting.embed_classname_whitelist if SiteSetting.embed_classname_whitelist.present?

    response = FetchResponse.new
    begin
      html = open(url, allow_redirections: :safe).read
    rescue OpenURI::HTTPError, Net::OpenTimeout
      return
    end

    raw_doc = Nokogiri::HTML(html)
    auth_element = raw_doc.at('meta[@name="author"]')
    if auth_element.present?
      response.author = User.where(username_lower: auth_element[:content].strip).first
    end

    read_doc = Readability::Document.new(html, opts)

    title = raw_doc.title || ''
    title.strip!

    if SiteSetting.embed_title_scrubber.present?
      title.sub!(Regexp.new(SiteSetting.embed_title_scrubber), '')
      title.strip!
    end
    response.title = title
    doc = Nokogiri::HTML(read_doc.content)

    tags = { 'img' => 'src', 'script' => 'src', 'a' => 'href' }
    doc.search(tags.keys.join(',')).each do |node|
      url_param = tags[node.name]
      src = node[url_param]
      unless (src.nil? || src.empty?)
        begin
          uri = URI.parse(escape_uri(src))
          unless uri.host
            uri.scheme = original_uri.scheme
            uri.host = original_uri.host
            node[url_param] = uri.to_s
          end
        rescue URI::InvalidURIError, URI::InvalidComponentError
          # If there is a mistyped URL, just do nothing
        end
      end
      # only allow classes in the whitelist
      allowed_classes = if embed_classname_whitelist.blank? then [] else embed_classname_whitelist.split(/[ ,]+/i) end
      doc.search('[class]:not([class=""])').each do |classnode|
        classes = classnode[:class].split(' ').select { |classname| allowed_classes.include?(classname) }
        if classes.length === 0
          classnode.delete('class')
        else
          classnode[:class] = classes.join(' ')
        end
      end
    end

    response.body = doc.to_html
    response
  end

  def self.import_remote(import_user, url, opts = nil)
    opts = opts || {}
    response = find_remote(url)
    return if response.nil?

    response.title = opts[:title] if opts[:title].present?
    import_user = response.author if response.author.present?

    TopicEmbed.import(import_user, url, response.title, response.body)
  end

  # Convert any relative URLs to absolute. RSS is annoying for this.
  def self.absolutize_urls(url, contents)
    url = normalize_url(url)
    uri = URI(escape_uri(url))
    prefix = "#{uri.scheme}://#{uri.host}"
    prefix << ":#{uri.port}" if uri.port != 80 && uri.port != 443

    fragment = Nokogiri::HTML.fragment("<div>#{contents}</div>")
    fragment.css('a').each do |a|
      href = a['href']
      if href.present? && href.start_with?('/')
        a['href'] = "#{prefix}/#{href.sub(/^\/+/, '')}"
      end
    end
    fragment.css('img').each do |a|
      src = a['src']
      if src.present? && src.start_with?('/')
        a['src'] = "#{prefix}/#{src.sub(/^\/+/, '')}"
      end
    end
    fragment.at('div').inner_html
  end

  def self.topic_id_for_embed(embed_url)
    embed_url = normalize_url(embed_url)
    TopicEmbed.where("lower(embed_url) = ?", embed_url).pluck(:topic_id).first
  end

  def self.first_paragraph_from(html)
    doc = Nokogiri::HTML(html)

    result = ""
    doc.css('p').each do |p|
      if p.text.present?
        result << p.to_s
        return result if result.size >= 100
      end
    end
    return result unless result.blank?

    # If there is no first paragaph, return the first div (onebox)
    doc.css('div').first
  end

  def self.expanded_for(post)
    Rails.cache.fetch("embed-topic:#{post.topic_id}", expires_in: 10.minutes) do
      url = TopicEmbed.where(topic_id: post.topic_id).pluck(:embed_url).first
      response = TopicEmbed.find_remote(url)

      body = response.body
      body << TopicEmbed.imported_from_html(url)
      body
    end
  end

end

# == Schema Information
#
# Table name: topic_embeds
#
#  id            :integer          not null, primary key
#  topic_id      :integer          not null
#  post_id       :integer          not null
#  embed_url     :string(1000)     not null
#  content_sha1  :string(40)
#  created_at    :datetime         not null
#  updated_at    :datetime         not null
#  deleted_at    :datetime
#  deleted_by_id :integer
#
# Indexes
#
#  index_topic_embeds_on_embed_url  (embed_url) UNIQUE
#