discourse/app/jobs/regular/pull_hotlinked_images.rb

require_dependency 'url_helper'
require_dependency 'file_helper'
require_dependency 'upload_creator'

module Jobs

  class PullHotlinkedImages < Jobs::Base

    sidekiq_options queue: 'low'

    # Caches the largest image size this job accepts, in bytes, taken from the
    # `max_image_size_kb` site setting at construction time.
    def initialize
      limit_in_kb = SiteSetting.max_image_size_kb
      # the setting is in kilobytes, the job compares against byte sizes
      @max_size = limit_in_kb.kilobytes
    end

    # Pulls the remotely hosted ("hotlinked") images of a post into local
    # uploads and rewrites the post so that it references those uploads.
    #
    # Every image found in the cooked HTML whose URL passes
    # +is_valid_image_url+ is downloaded, turned into an upload created for the
    # post's user_id, and each reference to the original URL in the raw text is
    # replaced by the upload's URL. Afterwards:
    #
    # * if the raw text changed and the stored post still has the raw we
    #   started from, the post is revised as the system user, without bumping
    #   the topic;
    # * otherwise, if at least one image was downloaded, post processing is
    #   triggered again.
    #
    # A failure on one image is logged at info level and does not prevent the
    # remaining images from being handled.
    #
    # args - Hash of job arguments; +args[:post_id]+ is required.
    #
    # Returns nothing meaningful. Does nothing when the
    # `download_remote_images_to_local` site setting is off or the post cannot
    # be found.
    # Raises Discourse::InvalidParameters when +:post_id+ is missing.
    def execute(args)
      return unless SiteSetting.download_remote_images_to_local?

      post_id = args[:post_id]
      raise Discourse::InvalidParameters.new(:post_id) unless post_id.present?

      post = Post.find_by(id: post_id)
      return unless post.present?

      raw = post.raw.dup
      # untouched copy, used below to detect edits made while the job was running
      start_raw = raw.dup
      # maps a source URL to the URL of the upload created from it
      downloaded_urls = {}

      extract_images_from(post.cooked).each do |image|
        src = original_src = image['src']
        # protocol-relative URLs get a scheme for downloading; the raw text is
        # still searched for the URL exactly as it was written (original_src)
        src = "http:#{src}" if src.start_with?("//")
        next unless is_valid_image_url(src)

        begin
          # have we already downloaded that file? (only successes are remembered)
          unless downloaded_urls.include?(src)
            upload = pull_hotlinked_image(src, post, post_id)
            downloaded_urls[src] = upload.url if upload
          end

          # have we successfully downloaded that file?
          url = downloaded_urls[src]
          replace_hotlinked_src(raw, original_src, url) if url.present?
        rescue => e
          log(:info, "Failed to pull hotlinked image: #{src} post:#{post_id}\n#{e.message}\n#{Array(e.backtrace).join("\n")}")
        end
      end

      post.reload
      # only revise when nobody changed the post while we were downloading,
      # otherwise we would overwrite their edit (and could loop forever)
      if start_raw == post.raw && raw != post.raw
        changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") }
        # we never want that job to bump the topic
        options = { bypass_bump: true }
        post.revise(Discourse.system_user, changes, options)
      elsif downloaded_urls.present?
        post.trigger_post_process(true)
      end
    end

    # Internal helper for +execute+: downloads +src+ and creates an upload from
    # it for the user of +post+.
    #
    # src     - String URL of the remote image (with a scheme).
    # post    - the post being processed; only its user_id is read.
    # post_id - the id the job was invoked with, used in log messages.
    #
    # Returns the persisted upload, or nil (after logging the reason at info
    # level) when the download failed, the downloaded file is bigger than
    # @max_size, or the upload could not be saved.
    def pull_hotlinked_image(src, post, post_id)
      hotlinked =
        begin
          FileHelper.download(
            src,
            max_file_size: @max_size,
            tmp_file_name: "discourse-hotlinked",
            follow_redirect: true
          )
        rescue Discourse::InvalidParameters
          # handled like any other failed download
          nil
        end

      unless hotlinked
        log(:info, "There was an error while downloading '#{src}' locally for post: #{post_id}")
        return nil
      end

      if File.size(hotlinked.path) > @max_size
        log(:info, "Failed to pull hotlinked image for post: #{post_id}: #{src} - Image is bigger than #{@max_size}")
        return nil
      end

      filename = File.basename(URI.parse(src).path)
      # URLs without an extension borrow the one of the downloaded file
      filename << File.extname(hotlinked.path) unless filename["."]

      upload = UploadCreator.new(hotlinked, filename, origin: src).create_for(post.user_id)
      return upload if upload.persisted?

      # `errors` is an errors collection, not an Array: go through
      # `full_messages` to get something that can be joined
      log(:info, "Failed to pull hotlinked image for post: #{post_id}: #{src} - #{upload.errors.full_messages.join("\n")}")
      nil
    end
    private :pull_hotlinked_image

    # Internal helper for +execute+: rewrites, in place, every supported way of
    # referencing +original_src+ inside +raw+ so that it points at +url+.
    #
    # raw          - String raw text of the post; mutated.
    # original_src - String image URL exactly as it appears in the post.
    # url          - String URL of the local upload.
    #
    # The order of the substitutions matters: the linked-image form has to be
    # handled before the plain inline form, which would otherwise match first.
    def replace_hotlinked_src(raw, original_src, url)
      escaped_src = Regexp.escape(original_src)
      # HTML tag - <img src="http://...">
      raw.gsub!(/src=["']#{escaped_src}["']/i, "src='#{url}'")
      # BBCode tag - [img]http://...[/img]
      raw.gsub!(/\[img\]#{escaped_src}\[\/img\]/i, "[img]#{url}[/img]")
      # Markdown linked image - [![alt](http://...)](http://...)
      raw.gsub!(/\[!\[([^\]]*)\]\(#{escaped_src}\)\]/) { "[<img src='#{url}' alt='#{$1}'>]" }
      # Markdown inline - ![alt](http://...)
      raw.gsub!(/!\[([^\]]*)\]\(#{escaped_src}\)/) { "![#{$1}](#{url})" }
      # Markdown inline - ![](http://... "image title") (the title is dropped)
      raw.gsub!(/!\[\]\(#{escaped_src} "([^\]]*)"\)/) { "![](#{url})" }
      # Markdown inline - ![alt](http://... "image title") (alt and title are dropped)
      raw.gsub!(/!\[([^\]]*)\]\(#{escaped_src} "([^\]]*)"\)/) { "![](#{url})" }
      # Markdown reference - [x]: http://
      raw.gsub!(/\[([^\]]+)\]:\s?#{escaped_src}/) { "[#{$1}]: #{url}" }
      # Direct link - the URL alone on its line
      raw.gsub!(/^#{escaped_src}(\s?)$/) { "<img src='#{url}'>#{$1}" }
      raw
    end
    private :replace_hotlinked_src

    # Returns the <img> elements of +html+ that carry a src attribute, leaving
    # out the ones marked with the "avatar" class.
    #
    # html - String of (cooked) HTML, parsed as a fragment.
    def extract_images_from(html)
      fragment = Nokogiri::HTML.fragment(html)
      with_src = fragment.css("img[src]")
      avatars = fragment.css("img.avatar")
      with_src - avatars
    end

    # Tells whether +src+ is the URL of a remote image that should be pulled.
    #
    # A URL is rejected when it is blank, already points at an upload of the
    # current store, is a relative path, cannot be parsed, has no hostname, or
    # is hosted on the asset host, the S3 CDN or the site's own domain. What is
    # left is submitted to +SiteSetting.should_download_images?+, whose answer
    # is returned.
    #
    # Hostnames are compared case-insensitively, so a differently-cased
    # spelling of one of our own hosts is not mistaken for a remote site.
    #
    # src - String URL (may be nil or empty).
    #
    # Returns false for every rejected URL, otherwise the result of
    # +SiteSetting.should_download_images?(src)+.
    def is_valid_image_url(src)
      # make sure we actually have a url
      return false unless src.present?
      # we don't want to pull uploaded images
      return false if Discourse.store.has_been_uploaded?(src)
      # we don't want to pull relative images ("/foo", but not "//host/foo")
      return false if src =~ /\A\/[^\/]/

      # parse the src
      begin
        uri = URI.parse(src)
      rescue URI::InvalidURIError
        return false
      end

      hostname = uri.hostname
      return false unless hostname
      hostname = hostname.downcase

      # URI.parse keeps the host exactly as written, hence the downcase on both sides
      hosted_on = lambda do |url|
        other = URI.parse(url).hostname
        !other.nil? && other.downcase == hostname
      end

      # we don't want to pull images hosted on the CDN (if we use one)
      return false if Discourse.asset_host.present? && hosted_on.call(Discourse.asset_host)
      return false if SiteSetting.s3_cdn_url.present? && hosted_on.call(SiteSetting.s3_cdn_url)
      # we don't want to pull images hosted on the main domain
      return false if hosted_on.call(Discourse.base_url_no_prefix)
      # check the domains blacklist
      SiteSetting.should_download_images?(src)
    end

    # Sends +message+ to the Rails logger, prefixed with the name of the
    # current multisite database so the originating site can be identified.
    #
    # log_level - Symbol naming the logger method to call (e.g. :info).
    # message   - String to log.
    def log(log_level, message)
      site = RailsMultisite::ConnectionManagement.current_db
      Rails.logger.public_send(log_level, "#{site}: #{message}")
    end

  end

end
FIX markdown hotlinked images were not properly pulled 2013-11-20 07:10:08 -05:00			`require_dependency 'url_helper'`
FEATURE: support email attachments 2014-04-14 16:55:57 -04:00			`require_dependency 'file_helper'`
REFACTOR: upload workflow creation into UploadCreator - Automatically convert large-ish PNG/BMP to JPEG - Updated fast_image to latest version 2017-05-10 18:16:57 -04:00			`require_dependency 'upload_creator'`
FIX markdown hotlinked images were not properly pulled 2013-11-20 07:10:08 -05:00
pull hotlinked images 2013-11-05 13:04:47 -05:00			`module Jobs`

			`class PullHotlinkedImages < Jobs::Base`
FEATURE: prioritize sidekiq jobs This commit introduces 3 queues for sidekiq "critical" for urgent jobs (weighted at 4x weight) "default" for standard jobs(weighted at 2x weight) "low" for less important jobs "critical jobs" Reset Password emails has been seperated to its own job Heartbeat which is required to keep sidekiq running Test email which needs to return real quick "low priority jobs" Notify mailing list Pull hotlinked images Update gravatar "default" All the rest Note: for people running sidekiq from command line use bin/sidekiq -q critical,4 -q default,2 -q low 2016-04-06 22:56:43 -04:00
			`sidekiq_options queue: 'low'`

pull hotlinked images 2013-11-05 13:04:47 -05:00			`def initialize`
			`# maximum size of the file in bytes`
do not pull hotlinked images when max_image_size_kb == 0 2013-11-13 11:30:48 -05:00			`@max_size = SiteSetting.max_image_size_kb.kilobytes`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`end`

			`def execute(args)`
add download_remote_images_to_local site setting 2013-11-15 09:22:18 -05:00			`return unless SiteSetting.download_remote_images_to_local?`
pull hotlinked images 2013-11-05 13:04:47 -05:00
			`post_id = args[:post_id]`
			`raise Discourse::InvalidParameters.new(:post_id) unless post_id.present?`

Perform the where(...).first to find_by(...) refactoring. This refactoring was automated using the command: bundle exec "ruby refactorings/where_dot_first_to_find_by/app.rb" 2014-05-06 09:41:59 -04:00			`post = Post.find_by(id: post_id)`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`return unless post.present?`

			`raw = post.raw.dup`
Backoff-retry for hotlinked image pull + some style fixes 2014-04-21 17:08:17 -04:00			`start_raw = raw.dup`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`downloaded_urls = {}`

			`extract_images_from(post.cooked).each do \|image\|`
FIX: Handle img src starting with "//" in pull_hotlinked_images job 2017-01-16 05:50:07 -05:00			`src = original_src = image['src']`
FIX: `Jobs::PullHotlinkedImages#is_valid_image_src` returns true for a generic string. 2017-07-06 04:55:28 -04:00			`src = "http:#{src}" if src.start_with?("//")`
pull hotlinked images 2013-11-05 13:04:47 -05:00
			`if is_valid_image_url(src)`
Backoff-retry for hotlinked image pull + some style fixes 2014-04-21 17:08:17 -04:00			`hotlinked = nil`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`begin`
			`# have we already downloaded that file?`
Backoff-retry for hotlinked image pull + some style fixes 2014-04-21 17:08:17 -04:00			`unless downloaded_urls.include?(src)`
BUGFIX: pull hotlinked images job wasn't properly handling the InvalidParameters exception 2014-04-22 09:32:48 -04:00			`begin`
Refactor `FileHelper` to use keyword arguments. 2017-05-24 13:42:52 -04:00			`hotlinked = FileHelper.download(`
			`src,`
			`max_file_size: @max_size,`
			`tmp_file_name: "discourse-hotlinked",`
			`follow_redirect: true`
			`)`
BUGFIX: pull hotlinked images job wasn't properly handling the InvalidParameters exception 2014-04-22 09:32:48 -04:00			`rescue Discourse::InvalidParameters`
			`end`
FIX: there's no need to try to download relative images 2014-09-26 12:27:10 -04:00			`if hotlinked`
FIX: Use File.size instead of IO.size 2015-08-17 12:57:28 -04:00			`if File.size(hotlinked.path) <= @max_size`
FIX: there's no need to try to download relative images 2014-09-26 12:27:10 -04:00			`filename = File.basename(URI.parse(src).path)`
FIX: pull hotlinked images even when they have no extension 2017-06-13 07:27:05 -04:00			`filename << File.extname(hotlinked.path) unless filename["."]`
REFACTOR: upload workflow creation into UploadCreator - Automatically convert large-ish PNG/BMP to JPEG - Updated fast_image to latest version 2017-05-10 18:16:57 -04:00			`upload = UploadCreator.new(hotlinked, filename, origin: src).create_for(post.user_id)`
FIX: pull hotlinked images even when they have no extension 2017-06-13 07:27:05 -04:00			`if upload.persisted?`
			`downloaded_urls[src] = upload.url`
			`else`
Log site name when logging to Logster in `Jobs::PullHotlinkedImages`. 2017-07-04 21:34:24 -04:00			`log(:info, "Failed to pull hotlinked image for post: #{post_id}: #{src} - #{upload.errors.join("\n")}")`
FIX: pull hotlinked images even when they have no extension 2017-06-13 07:27:05 -04:00			`end`
FIX: there's no need to try to download relative images 2014-09-26 12:27:10 -04:00			`else`
Log site name when logging to Logster in `Jobs::PullHotlinkedImages`. 2017-07-04 21:34:24 -04:00			`log(:info, "Failed to pull hotlinked image for post: #{post_id}: #{src} - Image is bigger than #{@max_size}")`
FIX: there's no need to try to download relative images 2014-09-26 12:27:10 -04:00			`end`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`else`
change log level to info when failing to download a hotlinked image 2017-07-12 05:06:28 -04:00			`log(:info, "There was an error while downloading '#{src}' locally for post: #{post_id}")`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`end`
			`end`
Correct few spelling in the comments 2013-12-21 02:19:22 -05:00			`# have we successfully downloaded that file?`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`if downloaded_urls[src].present?`
			`url = downloaded_urls[src]`
FIX: Handle img src starting with "//" in pull_hotlinked_images job 2017-01-16 05:50:07 -05:00			`escaped_src = Regexp.escape(original_src)`
FIX markdown hotlinked images were not properly pulled 2013-11-20 07:10:08 -05:00			`# there are 6 ways to insert an image in a post`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`# HTML tag - <img src="http://...">`
			`raw.gsub!(/src=["']#{escaped_src}["']/i, "src='#{url}'")`
			`# BBCode tag - [img]http://...[/img]`
			`raw.gsub!(/\[img\]#{escaped_src}\[\/img\]/i, "[img]#{url}[/img]")`
FIX markdown hotlinked images were not properly pulled 2013-11-20 07:10:08 -05:00			`# Markdown linked image - [![alt](http://...)](http://...)`
			`raw.gsub!(/\[!\[([^\]]*)\]\(#{escaped_src}\)\]/) { "[<img src='#{url}' alt='#{$1}'>]" }`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`# Markdown inline - ![alt](http://...)`
			`raw.gsub!(/!\[([^\]]*)\]\(#{escaped_src}\)/) { "![#{$1}](#{url})" }`
FIX: properly insert images in markdown inline format 2016-09-01 02:25:40 -04:00			`# Markdown inline - ![](http://... "image title")`
			`raw.gsub!(/!\[\]\(#{escaped_src} "([^\]]*)"\)/) { "![](#{url})" }`
FIX: properly insert images in markdown inline format (take 2) 2016-09-01 08:26:39 -04:00			`# Markdown inline - ![alt](http://... "image title")`
			`raw.gsub!(/!\[([^\]])\]\(#{escaped_src} "([^\]])"\)/) { "![](#{url})" }`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`# Markdown reference - [x]: http://`
FIX: PullHotlinkedImages was messing with URL when using Markdown references 2015-11-09 10:37:51 -05:00			`raw.gsub!(/\[([^\]]+)\]:\s?#{escaped_src}/) { "[#{$1}]: #{url}" }`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`# Direct link`
FIX: keep whitespaces when replacing direct link to external images with local images 2016-02-15 06:34:45 -05:00			`raw.gsub!(/^#{escaped_src}(\s?)$/) { "<img src='#{url}'>#{$1}" }`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`end`
			`rescue => e`
Log site name when logging to Logster in `Jobs::PullHotlinkedImages`. 2017-07-04 21:34:24 -04:00			`log(:info, "Failed to pull hotlinked image: #{src} post:#{post_id}\n" + e.message + "\n" + e.backtrace.join("\n"))`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`end`
			`end`

			`end`

Backoff-retry for hotlinked image pull + some style fixes 2014-04-21 17:08:17 -04:00			`post.reload`
FIX: prevent infinite loop in PullHotlinkedImages job 2015-10-30 17:46:46 -04:00			`if start_raw == post.raw && raw != post.raw`
LOTS of changes to properly handle post/topic revisions FIX: history revision can now properly be hidden FIX: PostRevision serializer is now entirely dynamic to properly handle hidden revisions FIX: default history modal to "side by side" view on mobile FIX: properly hiden which revision has been hidden UX: inline category/user/wiki/post_type changes with the revision details FEATURE: new '/posts/:post_id/revisions/latest' endpoint to retrieve latest revision UX: do not show the hide/show revision button on mobile (no room for them) UX: remove CSS transitions on the buttons in the history modal FIX: PostRevisor now handles all the changes that might create new revisions FIX: PostRevision.ensure_consistency! was wrong due to off by 1 mistake... refactored topic's callbacks for better readability extracted 'PostRevisionGuardian' 2014-10-27 17:06:43 -04:00			`changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") }`
			`# we never want that job to bump the topic`
			`options = { bypass_bump: true }`
			`post.revise(Discourse.system_user, changes, options)`
FEATURE: pull onebox images 2017-06-02 05:39:06 -04:00			`elsif downloaded_urls.present?`
			`post.trigger_post_process(true)`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`end`
			`end`

			`def extract_images_from(html)`
			`doc = Nokogiri::HTML::fragment(html)`
FEATURE: pull onebox images 2017-06-02 05:39:06 -04:00			`doc.css("img[src]") - doc.css("img.avatar")`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`end`

			`def is_valid_image_url(src)`
BUGFIX: make sure we do not try to pull images from the CDN 2014-05-07 13:49:16 -04:00			`# make sure we actually have a url`
			`return false unless src.present?`
			`# we don't want to pull uploaded images`
			`return false if Discourse.store.has_been_uploaded?(src)`
FIX: there's no need to try to download relative images 2014-09-26 12:27:10 -04:00			`# we don't want to pull relative images`
			`return false if src =~ /\A\/[^\/]/i`
FIX: `Jobs::PullHotlinkedImages#is_valid_image_src` returns true for a generic string. 2017-07-06 04:55:28 -04:00
BUGFIX: make sure we do not try to pull images from the CDN 2014-05-07 13:49:16 -04:00			`# parse the src`
			`begin`
			`uri = URI.parse(src)`
			`rescue URI::InvalidURIError`
			`return false`
			`end`
FIX: `Jobs::PullHotlinkedImages#is_valid_image_src` returns true for a generic string. 2017-07-06 04:55:28 -04:00
			`hostname = uri.hostname`
			`return false unless hostname`

BUGFIX: make sure we do not try to pull images from the CDN 2014-05-07 13:49:16 -04:00			`# we don't want to pull images hosted on the CDN (if we use one)`
FIX: `Jobs::PullHotlinkedImages#is_valid_image_src` returns true for a generic string. 2017-07-06 04:55:28 -04:00			`return false if Discourse.asset_host.present? && URI.parse(Discourse.asset_host).hostname == hostname`
			`return false if SiteSetting.s3_cdn_url.present? && URI.parse(SiteSetting.s3_cdn_url).hostname == hostname`
BUGFIX: make sure we do not try to pull images from the CDN 2014-05-07 13:49:16 -04:00			`# we don't want to pull images hosted on the main domain`
FIX: `Jobs::PullHotlinkedImages#is_valid_image_src` returns true for a generic string. 2017-07-06 04:55:28 -04:00			`return false if URI.parse(Discourse.base_url_no_prefix).hostname == hostname`
BUGFIX: make sure we do not try to pull images from the CDN 2014-05-07 13:49:16 -04:00			`# check the domains blacklist`
Add site setting for domains to never download images from 2014-04-21 16:59:53 -04:00			`SiteSetting.should_download_images?(src)`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`end`

Log site name when logging to Logster in `Jobs::PullHotlinkedImages`. 2017-07-04 21:34:24 -04:00			`def log(log_level, message)`
			`Rails.logger.public_send(`
			`log_level,`
			`"#{RailsMultisite::ConnectionManagement.current_db}: #{message}"`
			`)`
			`end`

pull hotlinked images 2013-11-05 13:04:47 -05:00			`end`

			`end`