discourse/app/jobs/regular/pull_hotlinked_images.rb

require_dependency 'url_helper'
require_dependency 'file_helper'
require_dependency 'upload_creator'

module Jobs

  class PullHotlinkedImages < Jobs::Base

    sidekiq_options queue: 'low'

    # Caches the largest image size this job will accept, converted from the
    # `max_image_size_kb` site setting (kilobytes) into bytes.
    def initialize
      limit_in_kb = SiteSetting.max_image_size_kb
      @max_size = limit_in_kb.kilobytes
    end

    # Pulls every hotlinked (remotely hosted) image of a post into a local
    # upload and revises the post's raw so it references the local copies.
    #
    # args - Hash of job arguments:
    #        :post_id - id of the post to process (required).
    #
    # Returns early, doing nothing, when the `download_remote_images_to_local`
    # site setting is off or when no post has that id. A StandardError raised
    # while handling one image is logged and the job moves on to the next image.
    #
    # Raises Discourse::InvalidParameters when :post_id is missing.
    def execute(args)
      return unless SiteSetting.download_remote_images_to_local?

      post_id = args[:post_id]
      raise Discourse::InvalidParameters.new(:post_id) unless post_id.present?

      post = Post.find_by(id: post_id)
      return unless post.present?

      raw = post.raw.dup
      # snapshot compared against the reloaded post once all downloads are done
      start_raw = raw.dup
      # src => url of the local upload (nil when the image could not be stored)
      downloaded_urls = {}

      extract_images_from(post.cooked).each do |image|
        src = original_src = image['src']
        # protocol-relative urls need a scheme before they can be fetched
        src = "http:" + src if src.start_with?("//")
        next unless is_valid_image_url(src)

        hotlinked = nil
        begin
          # have we already tried to download that file?
          unless downloaded_urls.include?(src)
            begin
              # NOTE(review): the trailing `true` presumably enables following
              # redirects - confirm against FileHelper.download
              hotlinked = FileHelper.download(src, @max_size, "discourse-hotlinked", true)
            rescue Discourse::InvalidParameters
              # handled like any other failed download: hotlinked stays nil
              # and the failure is logged below
            end
            if hotlinked
              if File.size(hotlinked.path) <= @max_size
                filename = File.basename(URI.parse(src).path)
                upload = UploadCreator.new(hotlinked, filename, origin: src).create_for(post.user_id)
                if upload.persisted?
                  downloaded_urls[src] = upload.url
                else
                  # remember the failure so the same image is not downloaded again
                  downloaded_urls[src] = nil
                  Rails.logger.info("Failed to pull hotlinked image for post: #{post_id}: #{src} - #{upload.errors.full_messages.join("\n")}")
                end
              else
                Rails.logger.info("Failed to pull hotlinked image for post: #{post_id}: #{src} - Image is bigger than #{@max_size}")
              end
            else
              Rails.logger.error("There was an error while downloading '#{src}' locally for post: #{post_id}")
            end
          end
          # have we successfully downloaded that file?
          url = downloaded_urls[src]
          replace_hotlinked_references(raw, original_src, url) if url.present?
        rescue => e
          Rails.logger.info("Failed to pull hotlinked image: #{src} post:#{post_id}\n" + e.message + "\n" + e.backtrace.join("\n"))
        ensure
          # close & delete the temp file
          hotlinked && hotlinked.close!
        end
      end

      post.reload
      # only revise when the stored raw still equals our snapshot (nobody
      # edited the post meanwhile) and at least one reference was replaced
      if start_raw == post.raw && raw != post.raw
        changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") }
        # we never want that job to bump the topic
        options = { bypass_bump: true }
        post.revise(Discourse.system_user, changes, options)
      end
    end

    # Rewrites, in place, every reference to +original_src+ found in +raw+ so
    # that it points at +url+ instead.
    #
    # raw          - String holding the post's raw text (mutated).
    # original_src - the image src exactly as it appears in the cooked post.
    # url          - url of the local upload replacing it.
    #
    # Every substitution uses the block form of gsub! so that +url+ is always
    # inserted literally; a replacement *string* would expand backslash
    # sequences such as "\0" or "\1" as back-references.
    # The order matters: the linked-image form has to run before the plain
    # inline form, which would otherwise match its inner part.
    def replace_hotlinked_references(raw, original_src, url)
      escaped_src = Regexp.escape(original_src)
      # there are 8 ways to insert an image in a post
      # HTML tag - <img src="http://...">
      raw.gsub!(/src=["']#{escaped_src}["']/i) { "src='#{url}'" }
      # BBCode tag - [img]http://...[/img]
      raw.gsub!(/\[img\]#{escaped_src}\[\/img\]/i) { "[img]#{url}[/img]" }
      # Markdown linked image - [![alt](http://...)](http://...)
      raw.gsub!(/\[!\[([^\]]*)\]\(#{escaped_src}\)\]/) { "[<img src='#{url}' alt='#{$1}'>]" }
      # Markdown inline - ![alt](http://...)
      raw.gsub!(/!\[([^\]]*)\]\(#{escaped_src}\)/) { "![#{$1}](#{url})" }
      # NOTE(review): the two titled forms below drop the title (and, for the
      # second one, the alt text) - confirm this is intended
      # Markdown inline - ![](http://... "image title")
      raw.gsub!(/!\[\]\(#{escaped_src} "([^\]]*)"\)/) { "![](#{url})" }
      # Markdown inline - ![alt](http://... "image title")
      raw.gsub!(/!\[([^\]]*)\]\(#{escaped_src} "([^\]]*)"\)/) { "![](#{url})" }
      # Markdown reference - [x]: http://
      raw.gsub!(/\[([^\]]+)\]:\s?#{escaped_src}/) { "[#{$1}]: #{url}" }
      # Direct link (the url alone on its line, trailing whitespace preserved)
      raw.gsub!(/^#{escaped_src}(\s?)$/) { "<img src='#{url}'>#{$1}" }
    end

    # Collects the <img> elements of an HTML fragment that carry a src
    # attribute, leaving out images nested under `.onebox-result` and images
    # with the `avatar` class.
    #
    # html - String of HTML to parse.
    #
    # Returns the remaining image nodes.
    def extract_images_from(html)
      fragment = Nokogiri::HTML::fragment(html)
      with_src = fragment.css("img[src]")
      oneboxed = fragment.css(".onebox-result img")
      avatars = fragment.css("img.avatar")
      with_src - oneboxed - avatars
    end

    # Decides whether +src+ points at a remote image this job should download.
    #
    # Returns false for a blank url, for a file the store reports as already
    # uploaded, for a relative path, for an unparsable url and for a url whose
    # hostname equals that of the asset host, the S3 CDN or the site's base
    # url. Otherwise returns whatever SiteSetting.should_download_images? says.
    def is_valid_image_url(src)
      # nothing to pull without a url
      return false if !src.present?
      # already stored as one of our uploads
      return false if Discourse.store.has_been_uploaded?(src)
      # relative path: a single leading slash
      return false if src =~ /\A\/[^\/]/i

      # a src that cannot be parsed is never pulled
      image_host =
        begin
          URI.parse(src).hostname
        rescue URI::InvalidURIError
          return false
        end

      hosted_on = lambda { |location| URI.parse(location).hostname == image_host }

      # images served by the CDN (if we use one) stay where they are
      asset_host = Discourse.asset_host
      return false if asset_host.present? && hosted_on.call(asset_host)
      s3_cdn = SiteSetting.s3_cdn_url
      return false if s3_cdn.present? && hosted_on.call(s3_cdn)
      # so do images hosted on the main domain
      return false if hosted_on.call(Discourse.base_url_no_prefix)
      # last word goes to the domains blacklist
      SiteSetting.should_download_images?(src)
    end

  end

end
FIX markdown hotlinked images were not properly pulled 2013-11-20 07:10:08 -05:00			`require_dependency 'url_helper'`
FEATURE: support email attachments 2014-04-14 16:55:57 -04:00			`require_dependency 'file_helper'`
REFACTOR: upload workflow creation into UploadCreator - Automatically convert large-ish PNG/BMP to JPEG - Updated fast_image to latest version 2017-05-10 18:16:57 -04:00			`require_dependency 'upload_creator'`
FIX markdown hotlinked images were not properly pulled 2013-11-20 07:10:08 -05:00
pull hotlinked images 2013-11-05 13:04:47 -05:00			`module Jobs`

			`class PullHotlinkedImages < Jobs::Base`
FEATURE: prioritize sidekiq jobs This commit introduces 3 queues for sidekiq "critical" for urgent jobs (weighted at 4x weight) "default" for standard jobs(weighted at 2x weight) "low" for less important jobs "critical jobs" Reset Password emails has been seperated to its own job Heartbeat which is required to keep sidekiq running Test email which needs to return real quick "low priority jobs" Notify mailing list Pull hotlinked images Update gravatar "default" All the rest Note: for people running sidekiq from command line use bin/sidekiq -q critical,4 -q default,2 -q low 2016-04-06 22:56:43 -04:00
			`sidekiq_options queue: 'low'`

pull hotlinked images 2013-11-05 13:04:47 -05:00			`def initialize`
			`# maximum size of the file in bytes`
do not pull hotlinked images when max_image_size_kb == 0 2013-11-13 11:30:48 -05:00			`@max_size = SiteSetting.max_image_size_kb.kilobytes`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`end`

			`def execute(args)`
add download_remote_images_to_local site setting 2013-11-15 09:22:18 -05:00			`return unless SiteSetting.download_remote_images_to_local?`
pull hotlinked images 2013-11-05 13:04:47 -05:00
			`post_id = args[:post_id]`
			`raise Discourse::InvalidParameters.new(:post_id) unless post_id.present?`

Perform the where(...).first to find_by(...) refactoring. This refactoring was automated using the command: bundle exec "ruby refactorings/where_dot_first_to_find_by/app.rb" 2014-05-06 09:41:59 -04:00			`post = Post.find_by(id: post_id)`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`return unless post.present?`

			`raw = post.raw.dup`
Backoff-retry for hotlinked image pull + some style fixes 2014-04-21 17:08:17 -04:00			`start_raw = raw.dup`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`downloaded_urls = {}`

			`extract_images_from(post.cooked).each do \|image\|`
FIX: Handle img src starting with "//" in pull_hotlinked_images job 2017-01-16 05:50:07 -05:00			`src = original_src = image['src']`
make sure image urls have a scheme before pulling them in 2013-11-25 13:47:53 -05:00			`src = "http:" + src if src.start_with?("//")`
pull hotlinked images 2013-11-05 13:04:47 -05:00
			`if is_valid_image_url(src)`
Backoff-retry for hotlinked image pull + some style fixes 2014-04-21 17:08:17 -04:00			`hotlinked = nil`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`begin`
			`# have we already downloaded that file?`
Backoff-retry for hotlinked image pull + some style fixes 2014-04-21 17:08:17 -04:00			`unless downloaded_urls.include?(src)`
BUGFIX: pull hotlinked images job wasn't properly handling the InvalidParameters exception 2014-04-22 09:32:48 -04:00			`begin`
FIX: follow redirects when pulling hotlinked images 2015-08-14 06:46:52 -04:00			`hotlinked = FileHelper.download(src, @max_size, "discourse-hotlinked", true)`
BUGFIX: pull hotlinked images job wasn't properly handling the InvalidParameters exception 2014-04-22 09:32:48 -04:00			`rescue Discourse::InvalidParameters`
			`end`
FIX: there's no need to try to download relative images 2014-09-26 12:27:10 -04:00			`if hotlinked`
FIX: Use File.size instead of IO.size 2015-08-17 12:57:28 -04:00			`if File.size(hotlinked.path) <= @max_size`
FIX: there's no need to try to download relative images 2014-09-26 12:27:10 -04:00			`filename = File.basename(URI.parse(src).path)`
REFACTOR: upload workflow creation into UploadCreator - Automatically convert large-ish PNG/BMP to JPEG - Updated fast_image to latest version 2017-05-10 18:16:57 -04:00			`upload = UploadCreator.new(hotlinked, filename, origin: src).create_for(post.user_id)`
FIX: there's no need to try to download relative images 2014-09-26 12:27:10 -04:00			`downloaded_urls[src] = upload.url`
			`else`
lower the volume on failed to pull hotlinked image add more diagnostics 2015-08-18 22:32:31 -04:00			`Rails.logger.info("Failed to pull hotlinked image for post: #{post_id}: #{src} - Image is bigger than #{@max_size}")`
FIX: there's no need to try to download relative images 2014-09-26 12:27:10 -04:00			`end`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`else`
lower the volume on failed to pull hotlinked image add more diagnostics 2015-08-18 22:32:31 -04:00			`Rails.logger.error("There was an error while downloading '#{src}' locally for post: #{post_id}")`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`end`
			`end`
Correct few spelling in the comments 2013-12-21 02:19:22 -05:00			`# have we successfully downloaded that file?`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`if downloaded_urls[src].present?`
			`url = downloaded_urls[src]`
FIX: Handle img src starting with "//" in pull_hotlinked_images job 2017-01-16 05:50:07 -05:00			`escaped_src = Regexp.escape(original_src)`
FIX markdown hotlinked images were not properly pulled 2013-11-20 07:10:08 -05:00			`# there are 6 ways to insert an image in a post`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`# HTML tag - <img src="http://...">`
			`raw.gsub!(/src=["']#{escaped_src}["']/i, "src='#{url}'")`
			`# BBCode tag - [img]http://...[/img]`
			`raw.gsub!(/\[img\]#{escaped_src}\[\/img\]/i, "[img]#{url}[/img]")`
FIX markdown hotlinked images were not properly pulled 2013-11-20 07:10:08 -05:00			`# Markdown linked image - [![alt](http://...)](http://...)`
			`raw.gsub!(/\[!\[([^\]]*)\]\(#{escaped_src}\)\]/) { "[<img src='#{url}' alt='#{$1}'>]" }`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`# Markdown inline - ![alt](http://...)`
			`raw.gsub!(/!\[([^\]]*)\]\(#{escaped_src}\)/) { "![#{$1}](#{url})" }`
FIX: properly insert images in markdown inline format 2016-09-01 02:25:40 -04:00			`# Markdown inline - ![](http://... "image title")`
			`raw.gsub!(/!\[\]\(#{escaped_src} "([^\]]*)"\)/) { "![](#{url})" }`
FIX: properly insert images in markdown inline format (take 2) 2016-09-01 08:26:39 -04:00			`# Markdown inline - ![alt](http://... "image title")`
			`raw.gsub!(/!\[([^\]])\]\(#{escaped_src} "([^\]])"\)/) { "![](#{url})" }`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`# Markdown reference - [x]: http://`
FIX: PullHotlinkedImages was messing with URL when using Markdown references 2015-11-09 10:37:51 -05:00			`raw.gsub!(/\[([^\]]+)\]:\s?#{escaped_src}/) { "[#{$1}]: #{url}" }`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`# Direct link`
FIX: keep whitespaces when replacing direct link to external images with local images 2016-02-15 06:34:45 -05:00			`raw.gsub!(/^#{escaped_src}(\s?)$/) { "<img src='#{url}'>#{$1}" }`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`end`
			`rescue => e`
lower the volume on failed to pull hotlinked image add more diagnostics 2015-08-18 22:32:31 -04:00			`Rails.logger.info("Failed to pull hotlinked image: #{src} post:#{post_id}\n" + e.message + "\n" + e.backtrace.join("\n"))`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`ensure`
			`# close & delete the temp file`
			`hotlinked && hotlinked.close!`
			`end`
			`end`

			`end`

Backoff-retry for hotlinked image pull + some style fixes 2014-04-21 17:08:17 -04:00			`post.reload`
FIX: prevent infinite loop in PullHotlinkedImages job 2015-10-30 17:46:46 -04:00			`if start_raw == post.raw && raw != post.raw`
LOTS of changes to properly handle post/topic revisions FIX: history revision can now properly be hidden FIX: PostRevision serializer is now entirely dynamic to properly handle hidden revisions FIX: default history modal to "side by side" view on mobile FIX: properly hiden which revision has been hidden UX: inline category/user/wiki/post_type changes with the revision details FEATURE: new '/posts/:post_id/revisions/latest' endpoint to retrieve latest revision UX: do not show the hide/show revision button on mobile (no room for them) UX: remove CSS transitions on the buttons in the history modal FIX: PostRevisor now handles all the changes that might create new revisions FIX: PostRevision.ensure_consistency! was wrong due to off by 1 mistake... refactored topic's callbacks for better readability extracted 'PostRevisionGuardian' 2014-10-27 17:06:43 -04:00			`changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") }`
			`# we never want that job to bump the topic`
			`options = { bypass_bump: true }`
			`post.revise(Discourse.system_user, changes, options)`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`end`
			`end`

			`def extract_images_from(html)`
			`doc = Nokogiri::HTML::fragment(html)`
BUGFIX: errors when post-processing 'data images' 2014-07-18 11:54:18 -04:00			`doc.css("img[src]") - doc.css(".onebox-result img") - doc.css("img.avatar")`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`end`

			`def is_valid_image_url(src)`
BUGFIX: make sure we do not try to pull images from the CDN 2014-05-07 13:49:16 -04:00			`# make sure we actually have a url`
			`return false unless src.present?`
			`# we don't want to pull uploaded images`
			`return false if Discourse.store.has_been_uploaded?(src)`
FIX: there's no need to try to download relative images 2014-09-26 12:27:10 -04:00			`# we don't want to pull relative images`
			`return false if src =~ /\A\/[^\/]/i`
BUGFIX: make sure we do not try to pull images from the CDN 2014-05-07 13:49:16 -04:00			`# parse the src`
			`begin`
			`uri = URI.parse(src)`
			`rescue URI::InvalidURIError`
			`return false`
			`end`
			`# we don't want to pull images hosted on the CDN (if we use one)`
			`return false if Discourse.asset_host.present? && URI.parse(Discourse.asset_host).hostname == uri.hostname`
should be using site setting not global 2015-05-26 21:17:46 -04:00			`return false if SiteSetting.s3_cdn_url.present? && URI.parse(SiteSetting.s3_cdn_url).hostname == uri.hostname`
BUGFIX: make sure we do not try to pull images from the CDN 2014-05-07 13:49:16 -04:00			`# we don't want to pull images hosted on the main domain`
			`return false if URI.parse(Discourse.base_url_no_prefix).hostname == uri.hostname`
			`# check the domains blacklist`
Add site setting for domains to never download images from 2014-04-21 16:59:53 -04:00			`SiteSetting.should_download_images?(src)`
pull hotlinked images 2013-11-05 13:04:47 -05:00			`end`

			`end`

			`end`