2019-05-02 18:17:27 -04:00
# frozen_string_literal: true
2016-05-19 08:25:08 -04:00
require 'mini_racer'
2013-02-05 14:16:51 -05:00
require 'nokogiri'
2016-06-14 14:31:51 -04:00
require 'erb'
2013-02-05 14:16:51 -05:00
module PrettyText
2021-11-21 19:43:03 -05:00
# Unicode bidirectional formatting characters. When found inside code blocks
# they are replaced with a visible warning by
# `strip_hidden_unicode_bidirectional_characters`.
DANGEROUS_BIDI_CHARACTERS = [
  "\u202A", # LEFT-TO-RIGHT EMBEDDING
  "\u202B", # RIGHT-TO-LEFT EMBEDDING
  "\u202C", # POP DIRECTIONAL FORMATTING
  "\u202D", # LEFT-TO-RIGHT OVERRIDE
  "\u202E", # RIGHT-TO-LEFT OVERRIDE
  "\u2066", # LEFT-TO-RIGHT ISOLATE
  "\u2067", # RIGHT-TO-LEFT ISOLATE
  "\u2068", # FIRST STRONG ISOLATE
  "\u2069", # POP DIRECTIONAL ISOLATE
].freeze

# Matches any single character listed in DANGEROUS_BIDI_CHARACTERS.
DANGEROUS_BIDI_REGEXP = Regexp.union(DANGEROUS_BIDI_CHARACTERS).freeze
FEATURE: Allow hotlinked media to be blocked (#16940)
This commit introduces a new site setting: `block_hotlinked_media`. When enabled, all attempts to hotlink media (images, videos, and audio) will fail, and be replaced with a linked placeholder. Exceptions to the rule can be added via `block_hotlinked_media_exceptions`.
`download_remote_image_to_local` can be used alongside this feature. In that case, hotlinked images will be blocked immediately when the post is created, but will then be replaced with the downloaded version a few seconds later.
This implementation is purely server-side, and does not impact the composer preview.
Technically, there are two stages to this feature:
1. `PrettyText.sanitize_hotlinked_media` is called during `PrettyText.cook`, and whenever new images are introduced by Onebox. It will iterate over all src/srcset attributes in the post HTML and check if they're allowed. If not, the attributes will be removed and replaced with a `data-blocked-hotlinked-src(set)` attribute
2. In the `CookedPostProcessor`, we iterate over all `data-blocked-hotlinked-src(set)` attributes and check whether we have a downloaded version of the media. If yes, we update the src to use the downloaded version. If not, the entire media element is replaced with a placeholder. The placeholder is labelled 'external media', and is a link to the offsite media.
2022-06-07 10:23:04 -04:00
# Attribute names that `sanitize_hotlinked_media` uses to stash the original
# `src` / `srcset` value of a blocked media element.
BLOCKED_HOTLINKED_SRC_ATTR = "data-blocked-hotlinked-src"
BLOCKED_HOTLINKED_SRCSET_ATTR = "#{BLOCKED_HOTLINKED_SRC_ATTR}set".freeze
2016-06-14 14:31:51 -04:00
@mutex = Mutex . new
@ctx_init = Mutex . new
2013-02-05 14:16:51 -05:00
2016-06-14 14:31:51 -04:00
# Returns the application root as reported by `Rails.root`.
# `ctx_load` prefixes every file it loads with this value.
def self.app_root
  root = Rails.root
  root
end
2013-07-16 03:48:48 -04:00
2016-06-14 14:31:51 -04:00
# Resolves `filename` (relative to `root`) to an existing file.
#
# The bare name is tried first, then the name with each of the suffixes
# `.js.es6`, `.js`, `.js.es6.erb` and `.js.erb`, in that order. `root` is
# concatenated as-is, so it is expected to end with a path separator.
#
# Returns the first relative name that exists as a regular file, or nil.
def self.find_file(root, filename)
  return filename if File.file?("#{root}#{filename}")

  %w[.js.es6 .js .js.es6.erb .js.erb]
    .map { |extension| "#{filename}#{extension}" }
    .find { |candidate| File.file?("#{root}#{candidate}") }
end
2015-12-28 01:28:16 -05:00
2016-06-14 14:31:51 -04:00
# Locates `part_name` and evaluates it in the JS context `ctx`.
#
# Files found under `root_path` are optionally rendered through ERB (when the
# resolved name ends in `.erb`), transpiled with
# DiscourseJsProcessor::Transpiler and then evaluated. When nothing is found
# there, the vendored javascripts directory is searched and a match is
# evaluated as-is, without transpiling. A missing file is silently ignored.
def self.apply_es6_file(ctx, root_path, part_name)
  filename = find_file(root_path, part_name)

  if filename.nil?
    # Look for vendored stuff
    vendor_root = "#{Rails.root}/vendor/assets/javascripts/"
    filename = find_file(vendor_root, part_name)
    return filename && ctx.eval(File.read("#{vendor_root}#{filename}"))
  end

  source = File.read("#{root_path}#{filename}")
  # ERB templates are rendered with this method's binding, so the locals
  # defined above are visible to them.
  source = ERB.new(source).result(binding) if filename =~ /\.erb$/

  transpiled =
    DiscourseJsProcessor::Transpiler.new.perform(
      source,
      "#{Rails.root}/app/assets/javascripts/",
      part_name
    )
  ctx.eval(transpiled)
end
2017-06-08 18:02:30 -04:00
# Reads the sprockets-style manifest `name` from app/assets/javascripts and
# applies every file it references to the JS context `ctx`.
#
# Two directives are understood:
#   //= require <path>       - applies a single file
#   //= require_tree <path>  - applies every entry directly matched by
#                              "<path>/**", in sorted order
# An optional leading "./" on the path is ignored. Other lines are skipped.
def self.ctx_load_manifest(ctx, name)
  root_path = "#{Rails.root}/app/assets/javascripts/"
  manifest = File.read("#{root_path}#{name}")

  manifest.each_line(chomp: true) do |l|
    if l =~ /\/\/= require (\.\/)?(.*)$/
      apply_es6_file(ctx, root_path, Regexp.last_match(2))
    elsif l =~ /\/\/= require_tree (\.\/)?(.*)$/
      path = Regexp.last_match(2)
      # root_path already ends with "/", so no extra separator is needed and
      # the prefix can be removed without slicing off a leading slash.
      Dir["#{root_path}#{path}/**"].sort.each do |f|
        # Only strip a real ".js" / ".js.es6" extension (the dot is escaped).
        part_name = f.delete_prefix(root_path).sub(/\.js(\.es6)?$/, '')
        apply_es6_file(ctx, root_path, part_name)
      end
    end
  end
end
# Builds a fresh MiniRacer context with everything needed to cook markdown:
# logger and helper bindings, the loader/shim scripts, the pretty-text and
# markdown-it bundles, a handful of individual modules, and any
# `discourse-markdown` assets and vendored scripts registered by plugins.
#
# Returns the configured MiniRacer::Context.
def self.create_es6_context
  ctx = MiniRacer::Context.new(timeout: 25000, ensure_gc_after_idle: 2000)

  ctx.eval("window = {}; window.devicePixelRatio = 2;") # hack to make code think stuff is retina

  # Expose rails.logger.{info,warn,error} to JS; console.* below forwards to them.
  %i[info warn error].each do |level|
    ctx.attach("rails.logger.#{level}", proc { |err| Rails.logger.public_send(level, err.to_s) })
  end

  ctx.eval <<~JS
    console = {
      prefix: "[PrettyText] ",
      log: function(...args){ rails.logger.info(console.prefix + args.join(" ")); },
      warn: function(...args){ rails.logger.warn(console.prefix + args.join(" ")); },
      error: function(...args){ rails.logger.error(console.prefix + args.join(" ")); }
    }
  JS

  ctx.eval("__PRETTY_TEXT = true")

  PrettyText::Helpers.instance_methods.each do |method|
    ctx.attach("__helpers.#{method}", PrettyText::Helpers.method(method))
  end

  root_path = "#{Rails.root}/app/assets/javascripts/"

  %w[mini-loader.js handlebars-shim.js node_modules/xss/dist/xss.min.js].each do |script|
    ctx_load(ctx, "#{root_path}#{script}")
  end
  ctx.load("#{Rails.root}/lib/pretty_text/vendor-shims.js")

  %w[pretty-text-bundle.js markdown-it-bundle.js].each do |bundle|
    ctx_load_manifest(ctx, bundle)
  end

  %w[
    discourse-common/addon/lib/get-url
    discourse-common/addon/lib/object
    discourse-common/addon/lib/deprecated
    discourse-common/addon/lib/escape
    discourse-common/addon/utils/watched-words
    discourse/app/lib/to-markdown
    discourse/app/lib/utilities
  ].each { |part_name| apply_es6_file(ctx, root_path, part_name) }

  ctx.load("#{Rails.root}/lib/pretty_text/shims.js")
  ctx.eval("__setUnicode(#{Emoji.unicode_replacements_json})")

  # Plugin-provided markdown extensions.
  to_load = []
  DiscoursePluginRegistry.each_globbed_asset do |a|
    to_load << a if File.file?(a) && a =~ /discourse-markdown/
  end

  to_load.uniq.each do |f|
    next unless f =~ /^.+assets\/javascripts\//

    root = Regexp.last_match[0]
    apply_es6_file(ctx, root, f.sub(root, '').sub(/\.js(\.es6)?$/, ''))
  end

  # Vendored scripts: core ones first, then the plugin-registered ones.
  %i[vendored_core_pretty_text vendored_pretty_text].each do |registry|
    DiscoursePluginRegistry.public_send(registry).each { |vpt| ctx.eval(File.read(vpt)) }
  end

  ctx
end
# Returns the shared JS context, creating it on first use.
#
# Creation happens while holding @ctx_init and the instance variable is
# re-checked inside the lock, so that we only init one of these.
def self.v8
  return @ctx if @ctx

  @ctx_init.synchronize { @ctx ||= create_es6_context }
end
2020-05-27 14:11:52 -04:00
# Evaluates `__resetTranslationTree()` in the shared JS context and returns
# whatever that evaluation returns.
def self.reset_translations
  context = v8
  context.eval("__resetTranslationTree()")
end
2014-11-14 01:51:04 -05:00
# Disposes the shared JS context (if one exists) and forgets it, so the next
# call to `v8` builds a new one. Runs while holding @ctx_init.
def self.reset_context
  @ctx_init.synchronize do
    @ctx.dispose unless @ctx.nil?
    @ctx = nil
  end
end
2022-01-06 02:27:12 -05:00
# Cooks `text` (markdown) into HTML using the shared JS context.
#
# Acceptable options:
#
#  disable_emojis    - Disables the emoji markdown engine.
#  features          - A hash where the key is the markdown feature name and the value is a boolean to enable/disable the markdown feature.
#                      The hash is merged into the default features set in pretty-text.js which can be used to add new features or disable existing features.
#  features_override - An array of markdown feature names to override the default markdown feature set. Currently used by plugins to customize what features should be enabled
#                      when rendering markdown.
#  markdown_it_rules - An array of markdown rule names which will be applied to the markdown-it engine. Currently used by plugins to customize what markdown-it rules should be
#                      enabled when rendering markdown.
#  topic_id          - Topic id for the post being cooked.
#  user_id           - User id for the post being cooked.
#  force_quote_link  - Always create the link to the quoted topic for [quote] bbcode. Normally this only happens
#                      if the topic_id provided is different from the [quote topic:X].
#  sanitize          - Pass `false` to disable the sanitizer for this call.
#
# A nil `text` is treated as an empty string. Returns the value produced by
# `__pt.cook(...)` in the JS context.
def self.markdown(text, opts = {})
  # we use the exact same markdown converter as the client
  # TODO: use the same extensions on both client and server (in particular the template for mentions)
  text = text || ""

  protect do
    context = v8

    custom_emoji = Emoji.custom.map { |e| [e.name, e.url] }.to_h

    # note, any additional options added to __optInput here must be
    # also be added to the buildOptions function in pretty-text.js,
    # otherwise they will be discarded
    buffer = +<<~JS
      __optInput = {};
      __optInput.siteSettings = #{SiteSetting.client_settings_json};
      #{"__optInput.disableEmojis = true" if opts[:disable_emojis]}
      __paths = #{paths_json};
      __optInput.getURL = __getURL;
      #{"__optInput.features = #{opts[:features].to_json};" if opts[:features]}
      #{"__optInput.featuresOverride = #{opts[:features_override].to_json};" if opts[:features_override]}
      #{"__optInput.markdownItRules = #{opts[:markdown_it_rules].to_json};" if opts[:markdown_it_rules]}
      __optInput.getCurrentUser = __getCurrentUser;
      __optInput.lookupAvatar = __lookupAvatar;
      __optInput.lookupPrimaryUserGroup = __lookupPrimaryUserGroup;
      __optInput.formatUsername = __formatUsername;
      __optInput.getTopicInfo = __getTopicInfo;
      __optInput.categoryHashtagLookup = __categoryLookup;
      __optInput.customEmoji = #{custom_emoji.to_json};
      __optInput.customEmojiTranslation = #{Plugin::CustomEmoji.translations.to_json};
      __optInput.emojiUnicodeReplacer = __emojiUnicodeReplacer;
      __optInput.lookupUploadUrls = __lookupUploadUrls;
      __optInput.censoredRegexp = #{WordWatcher.serializable_word_matcher_regexp(:censor).to_json};
      __optInput.watchedWordsReplace = #{WordWatcher.word_matcher_regexps(:replace).to_json};
      __optInput.watchedWordsLink = #{WordWatcher.word_matcher_regexps(:link).to_json};
      __optInput.additionalOptions = #{Site.markdown_additional_options.to_json};
    JS

    buffer << "__optInput.topicId = #{opts[:topic_id].to_i};\n" if opts[:topic_id]

    # Emit a literal `true` rather than interpolating the Ruby value, so any
    # truthy option yields valid JS.
    buffer << "__optInput.forceQuoteLink = true;\n" if opts[:force_quote_link]

    buffer << "__optInput.userId = #{opts[:user_id].to_i};\n" if opts[:user_id]

    buffer << "__textOptions = __buildOptions(__optInput);\n"
    buffer << "__pt = new __PrettyText(__textOptions);"

    # Be careful disabling sanitization. We allow for custom emails
    buffer << "__pt.disableSanitizer();" if opts[:sanitize] == false

    context.eval(buffer)

    DiscourseEvent.trigger(:markdown_context, context)

    # NOTE(review): relies on String#inspect producing a valid JS string
    # literal for `text` - confirm for unusual control characters.
    context.eval("__pt.cook(#{text.inspect})")
  end
end
2018-05-23 10:47:09 -04:00
# Serializes the URL/path settings that the JS side reads from `__paths`.
#
# Always includes `baseUri` and `CDN`. When S3 uploads are enabled it also
# includes `S3BaseUrl` and, if an S3 CDN URL is configured, `S3CDN`.
#
# Returns a JSON string.
def self.paths_json
  paths = {
    baseUri: Discourse.base_path,
    CDN: Rails.configuration.action_controller.asset_host,
  }

  upload_settings = SiteSetting.Upload
  if upload_settings.enable_s3_uploads
    s3_cdn_url = upload_settings.s3_cdn_url
    paths[:S3CDN] = s3_cdn_url if s3_cdn_url.present?
    paths[:S3BaseUrl] = Discourse.store.absolute_base_url
  end

  paths.to_json
end
2013-02-05 14:16:51 -05:00
# leaving this here, cause it invokes v8, don't want to implement twice
#
# Evaluates `__utils.avatarImg` in the shared JS context for the given avatar
# template and size, and returns its result.
def self.avatar_img(avatar_template, size)
  protect do
    context = v8
    context.eval(<<~JS)
      __paths = #{paths_json};
      __utils.avatarImg({size: #{size.inspect}, avatarTemplate: #{avatar_template.inspect}}, __getURL);
    JS
  end
end
2015-10-15 03:59:29 -04:00
# Runs `__performEmojiUnescape` on `title` in the shared JS context and
# returns its result.
#
# Returns `title` unchanged when emoji are disabled or `title` is nil/false.
def self.unescape_emoji(title)
  return title unless SiteSetting.enable_emoji? && title

  set = SiteSetting.emoji_set.inspect
  custom = Emoji.custom.map { |e| [e.name, e.url] }.to_h.to_json
  # Serialize the CDN URL as a JS string literal instead of splicing it
  # between hand-written quotes, so quotes or backslashes in the setting
  # cannot break out of the string. A blank setting still becomes "".
  emoji_cdn_url = (SiteSetting.external_emoji_url.presence || "").to_json

  protect do
    v8.eval(<<~JS)
      __paths = #{paths_json};
      __performEmojiUnescape(#{title.inspect}, {
        getURL: __getURL,
        emojiSet: #{set},
        emojiCDNUrl: #{emoji_cdn_url},
        customEmoji: #{custom},
        enableEmojiShortcuts: #{SiteSetting.enable_emoji_shortcuts},
        inlineEmoji: #{SiteSetting.enable_inline_emoji_translation}
      });
    JS
  end
end
2019-03-21 04:11:33 -04:00
# Runs `__performEmojiEscape` on `title` in the shared JS context and returns
# its result. Returns nil when `title` is nil/false.
#
# Emoji shortcuts are only replaced when both emoji and emoji shortcuts are
# enabled in the site settings.
def self.escape_emoji(title)
  return unless title

  replace_emoji_shortcuts = SiteSetting.enable_emoji && SiteSetting.enable_emoji_shortcuts

  protect do
    context = v8
    context.eval(<<~JS)
      __performEmojiEscape(#{title.inspect}, {
        emojiShortcuts: #{replace_emoji_shortcuts},
        inlineEmoji: #{SiteSetting.enable_inline_emoji_translation}
      });
    JS
  end
end
2013-02-05 14:16:51 -05:00
# Converts raw markdown into the final cooked HTML.
#
# After `markdown` runs, the resulting fragment gets rel attributes on links,
# warnings for hidden bidi characters, hotlinked-media blocking and (when
# mentions are enabled) mention links. Any <script> element is removed before
# the HTML string is returned.
#
# `opts` is passed through to `markdown`; `:omit_nofollow` and `:user_id` are
# also read here.
def self.cook(text, opts = {})
  options = opts.dup
  working_text = text.dup

  sanitized = markdown(working_text, options)
  doc = Nokogiri::HTML5.fragment(sanitized)

  add_nofollow = !options[:omit_nofollow] && SiteSetting.add_rel_nofollow_to_user_content
  add_rel_attributes_to_user_content(doc, add_nofollow)
  strip_hidden_unicode_bidirectional_characters(doc)
  sanitize_hotlinked_media(doc)

  add_mentions(doc, user_id: opts[:user_id]) if SiteSetting.enable_mentions

  script_scrubber = Loofah::Scrubber.new { |node| node.remove if node.name == 'script' }
  Loofah.fragment(doc.to_html).scrub!(script_scrubber).to_html
end
2021-11-21 19:43:03 -05:00
# Replaces every dangerous bidi control character found inside <code> / <pre>
# elements of `doc` with a visible `<span class="bidi-warning">` that shows
# the code point (for example "<U+202E>" as text).
#
# Does nothing when the document text contains none of the characters.
# Mutates `doc` in place.
def self.strip_hidden_unicode_bidirectional_characters(doc)
  return if !DANGEROUS_BIDI_REGEXP.match?(doc.content)

  doc.css("code,pre").each do |code_tag|
    next if !DANGEROUS_BIDI_REGEXP.match?(code_tag.content)

    DANGEROUS_BIDI_CHARACTERS.each do |bidi|
      next if !code_tag.content.include?(bidi)

      # The label is assigned through inner_html, which parses markup, so the
      # angle brackets must be entities; a raw "<U+202E>" would be read as a tag.
      formatted = "&lt;U+#{bidi.ord.to_s(16).upcase}&gt;"
      code_tag.inner_html = code_tag.inner_html.gsub(
        bidi,
        "<span class=\"bidi-warning\" title=\"#{I18n.t("post.hidden_bidi_character")}\">#{formatted}</span>"
      )
    end
  end
end
FEATURE: Allow hotlinked media to be blocked (#16940)
This commit introduces a new site setting: `block_hotlinked_media`. When enabled, all attempts to hotlink media (images, videos, and audio) will fail, and be replaced with a linked placeholder. Exceptions to the rule can be added via `block_hotlinked_media_exceptions`.
`download_remote_image_to_local` can be used alongside this feature. In that case, hotlinked images will be blocked immediately when the post is created, but will then be replaced with the downloaded version a few seconds later.
This implementation is purely server-side, and does not impact the composer preview.
Technically, there are two stages to this feature:
1. `PrettyText.sanitize_hotlinked_media` is called during `PrettyText.cook`, and whenever new images are introduced by Onebox. It will iterate over all src/srcset attributes in the post HTML and check if they're allowed. If not, the attributes will be removed and replaced with a `data-blocked-hotlinked-src(set)` attribute
2. In the `CookedPostProcessor`, we iterate over all `data-blocked-hotlinked-src(set)` attributes and check whether we have a downloaded version of the media. If yes, we update the src to use the downloaded version. If not, the entire media element is replaced with a placeholder. The placeholder is labelled 'external media', and is a link to the offsite media.
2022-06-07 10:23:04 -04:00
# Blocks hotlinked media in `doc` when the `block_hotlinked_media` site
# setting is enabled.
#
# For every img/source/track element, a `src` that does not match the allowed
# pattern is moved to BLOCKED_HOTLINKED_SRC_ATTR. A `srcset` is moved to
# BLOCKED_HOTLINKED_SRCSET_ATTR when any of its candidate URLs is not allowed.
# Mutates `doc` in place.
def self.sanitize_hotlinked_media(doc)
  return if !SiteSetting.block_hotlinked_media

  allowed_pattern = allowed_src_pattern

  doc.css("img[src], source[src], source[srcset], track[src]").each do |el|
    if el["src"] && !el["src"].match?(allowed_pattern)
      el[PrettyText::BLOCKED_HOTLINKED_SRC_ATTR] = el.delete("src")
    end

    if el["srcset"]
      # Each candidate is "<url> [descriptor]". Empty candidates (stray or
      # doubled commas) carry no URL, so drop them instead of calling
      # match? on nil.
      srcs = el["srcset"].split(',').map { |e| e.split(' ', 2)[0].presence }.compact
      if srcs.any? { |src| !src.match?(allowed_pattern) }
        el[PrettyText::BLOCKED_HOTLINKED_SRCSET_ATTR] = el.delete("srcset")
      end
    end
  end
end
2020-09-10 11:59:51 -04:00
# Sets the `rel` attribute on every link in `doc`.
#
# Links with target="_blank" get "noopener". When `add_nofollow` is true,
# links pointing outside this site (and outside the domains listed in the
# `exclude_rel_nofollow_domains` setting) get "noopener nofollow ugc".
# Links whose href cannot be parsed as a URI always get "noopener nofollow ugc".
def self.add_rel_attributes_to_user_content(doc, add_nofollow)
  domains = SiteSetting.exclude_rel_nofollow_domains
  allowlist = domains.present? ? domains.split('|') : []

  site_uri = nil
  doc.css("a").each do |l|
    href = l["href"].to_s
    l["rel"] = "noopener" if l["target"] == "_blank"

    begin
      uri = URI(UrlHelper.encode_component(href))
      site_uri ||= URI(Discourse.base_url)

      host = uri.host
      same_domain =
        host.blank? ||
        host == site_uri.host ||
        host.ends_with?(".#{site_uri.host}") ||
        allowlist.any? { |u| host == u || host.ends_with?(".#{u}") }

      l["rel"] = "noopener nofollow ugc" if add_nofollow && !same_domain
    rescue URI::Error
      # add a nofollow anyway
      l["rel"] = "noopener nofollow ugc"
    end
  end
end
2017-02-06 08:45:04 -05:00
# A link found by `extract_links`: the URL plus a flag telling whether it was
# derived from a quote.
class DetectedLink < Struct.new(:url, :is_quote)
end
2014-07-11 00:17:01 -04:00
2013-02-05 14:16:51 -05:00
# Collects the links contained in cooked `html`.
#
# Returns an array of DetectedLink, in this order: onebox sources, plain
# anchors (excluding fragment-only hrefs and anchors inside quotes, oneboxes
# and elided parts), quoted topics/posts (flagged as quotes) and YouTube
# embeds.
def self.extract_links(html)
  doc = Nokogiri::HTML5.fragment(html)

  # extract onebox links
  links = doc.css("aside.onebox[data-onebox-src]").map do |onebox|
    DetectedLink.new(onebox["data-onebox-src"], false)
  end

  # remove href inside quotes & oneboxes & elided part
  doc.css("aside.quote a, aside.onebox a, .elided a").remove

  # remove hotlinked images
  doc.css("a.onebox > img").each { |img| img.parent.remove }

  # extract all links
  doc.css("a").each do |a|
    href = a["href"]
    links << DetectedLink.new(href, false) if href.present? && href[0] != "#"
  end

  # extract quotes
  doc.css("aside.quote[data-topic]").each do |aside|
    topic = aside["data-topic"]
    next unless topic.present?

    url = +"/t/#{topic}"
    post = aside["data-post"]
    url << "/#{post}" if post.present?
    links << DetectedLink.new(url, true)
  end

  # extract Youtube links
  doc.css("div[data-youtube-id]").each do |div|
    youtube_id = div["data-youtube-id"]
    next unless youtube_id.present?

    links << DetectedLink.new("https://www.youtube.com/watch?v=#{youtube_id}", false)
  end

  links
end
2013-05-27 19:48:47 -04:00
# Builds an excerpt of at most `max_length` from `html`.
#
# Before delegating to ExcerptParser, the :reduce_excerpt event is triggered
# and lightbox metadata plus audio/video elements are removed.
def self.excerpt(html, max_length, options = {})
  # TODO: properly fix this HACK in ExcerptParser without introducing XSS
  doc = Nokogiri::HTML5.fragment(html)
  DiscourseEvent.trigger(:reduce_excerpt, doc, options)
  strip_image_wrapping(doc)
  strip_oneboxed_media(doc)

  ExcerptParser.get_excerpt(doc.to_html, max_length, options)
end
2013-02-05 14:16:51 -05:00
2013-06-05 15:28:10 -04:00
# Replaces every <a> element in `string` with its inner HTML and returns the
# resulting HTML. Blank input is returned unchanged.
def self.strip_links(string)
  return string if string.blank?

  # If the user is not basic, strip links from their bio
  fragment = Nokogiri::HTML5.fragment(string)
  fragment.css('a').each do |anchor|
    anchor.replace(anchor.inner_html)
  end
  fragment.to_html
end
2018-05-09 13:24:44 -04:00
# Prefixes every host-less link in `doc` with the site's base URL.
#
# Links that already have a host, links starting with "mailto", and links
# whose href cannot be parsed as a URI are left untouched.
def self.make_all_links_absolute(doc)
  site_uri = nil

  doc.css("a").each do |link|
    href = link["href"].to_s

    begin
      uri = URI(href)
      site_uri ||= URI(Discourse.base_url)
      next if uri.host.present? || href.start_with?('mailto')

      link["href"] = "#{site_uri}#{href}"
    rescue URI::Error
      # leave it
    end
  end
end
2016-06-21 11:12:30 -04:00
2014-04-17 12:32:51 -04:00
# Removes the `.meta` elements found inside `.lightbox-wrapper` from `doc`.
def self.strip_image_wrapping(doc)
  lightbox_meta = doc.css(".lightbox-wrapper .meta")
  lightbox_meta.remove
end
2020-02-06 15:08:13 -05:00
# Removes audio elements, video elements and `.video-onebox` elements from `doc`.
def self.strip_oneboxed_media(doc)
  audio = doc.css("audio")
  audio.remove

  video = doc.css(".video-onebox,video")
  video.remove
end
2018-05-09 13:24:44 -04:00
# Replaces every Vimeo player iframe in `doc` with a paragraph that links to
# the video.
#
# The link target is the iframe's `data-original-href` (normalized) when
# present; otherwise it is built from the last path segment of the iframe src.
def self.convert_vimeo_iframes(doc)
  doc.css("iframe[src*='player.vimeo.com']").each do |iframe|
    original_href = iframe["data-original-href"]

    vimeo_url =
      if original_href.present?
        UrlHelper.normalized_encode(original_href)
      else
        "https://vimeo.com/#{iframe['src'].split('/').last}"
      end

    iframe.replace(Nokogiri::HTML5.fragment("<p><a href='#{vimeo_url}'>#{vimeo_url}</a></p>"))
  end
end
2019-11-17 20:25:42 -05:00
# Replaces secure media in `doc` with placeholders built by
# `secure_media_placeholder`.
#
# Two passes are made: first over links whose href is a secure media URL
# (lightboxed images, linked images, and links inside <video>/<audio>), then
# over remaining <img> elements whose URL is a secure media URL.
# Mutates `doc` in place.
def self.strip_secure_media(doc)
  # images inside a lightbox or other link
  doc.css('a[href]').each do |a|
    next if !Upload.secure_media_url?(a['href'])

    non_image_media = %w(video audio).include?(a.parent&.name)
    target = non_image_media ? a.parent : a
    next if target.to_s.include?('stripped-secure-view-media')

    img = a.at_css('img[src]')
    next if img.nil? && !non_image_media

    # A lightbox link nested in <video>/<audio> may have no image at all; it
    # is then handled by the generic branch below instead of calling methods
    # on a nil image.
    if img && a.classes.include?('lightbox')
      srcset = img['srcset']
      url =
        if srcset
          # if available, use the first image from the srcset here
          # so we get the optimized image instead of the possibly huge original
          srcset.split(',').first
        else
          img['src']
        end

      a.add_next_sibling secure_media_placeholder(doc, url, width: img['width'], height: img['height'])
      a.remove
    else
      width = non_image_media ? nil : a.at_css('img').attr('width')
      height = non_image_media ? nil : a.at_css('img').attr('height')
      target.add_next_sibling secure_media_placeholder(doc, a['href'], width: width, height: height)
      target.remove
    end
  end

  # images by themselves or inside a onebox
  doc.css('img[src]').each do |img|
    url =
      if img.parent.classes.include?("aspect-image") && img.attributes["srcset"].present?
        # we are using the first image from the srcset here so we get the
        # optimized image instead of the original, because an optimized
        # image may be used for the onebox thumbnail
        srcset = img.attributes["srcset"].value
        srcset.split(",").first
      else
        img['src']
      end

    width = img['width']
    height = img['height']
    onebox_type = nil

    if img.ancestors.css(".onebox-body").any?
      onebox_type = img.classes.include?("onebox-avatar-inline") ? "avatar-inline" : "thumbnail"
    end

    # we always want this to be tiny and without any special styles
    if img.classes.include?('site-icon')
      onebox_type = nil
      width = 16
      height = 16
    end

    if Upload.secure_media_url?(url)
      img.add_next_sibling secure_media_placeholder(doc, url, onebox_type: onebox_type, width: width, height: height)
      img.remove
    end
  end
end
2020-11-15 18:58:40 -05:00
# Builds the HTML placeholder shown in place of a piece of secure media.
#
# doc         - not used by this method; kept for interface compatibility.
# url         - URL of the redacted media; used for the data attribute and
#               for the "view media" link.
# onebox_type - optional value for the data-onebox-type attribute.
# width       - optional value for the data-width attribute.
# height      - optional value for the data-height attribute.
#
# All interpolated attribute values are HTML-escaped and quoted, so values
# containing quotes, spaces or angle brackets cannot alter the markup.
#
# Returns an HTML string.
def self.secure_media_placeholder(doc, url, onebox_type: false, width: nil, height: nil)
  escape = ->(value) { ERB::Util.html_escape(value) }

  safe_url = escape.(url)
  data_width = width ? "data-width=\"#{escape.(width)}\"" : ''
  data_height = height ? "data-height=\"#{escape.(height)}\"" : ''
  data_onebox_type = onebox_type ? "data-onebox-type=\"#{escape.(onebox_type)}\"" : ''

  <<~HTML
  <div class="secure-media-notice" data-stripped-secure-media="#{safe_url}" #{data_onebox_type} #{data_width} #{data_height}>
    #{I18n.t('emails.secure_media_placeholder')} <a class='stripped-secure-view-media' href="#{safe_url}">#{I18n.t("emails.view_redacted_media")}</a>.
  </div>
  HTML
end
2016-06-21 11:12:30 -04:00
# Prepares cooked `html` for inclusion in an email and returns the new HTML.
#
# Triggers the :reduce_cooked event, replaces secure media with placeholders
# when the post has secure media, removes lightbox metadata, turns Vimeo
# iframes into links and makes all links absolute.
def self.format_for_email(html, post = nil)
  doc = Nokogiri::HTML5.fragment(html)
  DiscourseEvent.trigger(:reduce_cooked, doc, post)

  strip_secure_media(doc) if post&.with_secure_media?

  %i[strip_image_wrapping convert_vimeo_iframes make_all_links_absolute].each do |step|
    public_send(step, doc)
  end

  doc.to_html
end
2013-05-27 19:48:47 -04:00
protected
2013-02-05 14:16:51 -05:00
2014-02-03 19:12:53 -05:00
# Error carrying a message and a backtrace supplied by the caller.
# Both values are stored as given and exposed through accessors.
class JavaScriptError < StandardError
  attr_accessor :message, :backtrace

  def initialize(message, backtrace)
    @message, @backtrace = message, backtrace
  end
end
# Runs the given block while holding @mutex and returns the block's value.
def self.protect
  @mutex.synchronize { yield }
end
2013-08-15 18:12:10 -04:00
# Loads each of `files` into the JS context `ctx`, prefixing every path with
# `app_root`.
def self.ctx_load(ctx, *files)
  files.each { |file| ctx.load(app_root + file) }
end
2018-11-22 01:28:48 -05:00
private

# Mention types looked up by `add_mentions` to decide how a mention is linked.
USER_TYPE ||= "user"
GROUP_TYPE ||= "group"
GROUP_MENTIONABLE_TYPE ||= "group-mentionable"
2018-11-22 01:28:48 -05:00
2018-11-22 03:01:03 -05:00
# Rewrites `span.mention` nodes in +doc+ into links when the mentioned name
# resolves (via lookup_mentions) to a user or a group. Nodes whose name does
# not resolve are left as they are. +doc+ is modified in place.
#
# doc     - parsed document or fragment responding to #css.
# user_id - forwarded unchanged to lookup_mentions.
def self.add_mentions(doc, user_id: nil)
  mention_nodes = doc.css("span.mention")

  # The first character of the node text is dropped (presumably the "@").
  # NOTE(review): for a node with empty text, `""[1..-1]` is nil and the
  # downcase calls would raise — confirm such nodes cannot reach this method.
  names = mention_nodes.map { |node| node.text[1..-1] }

  mentions = lookup_mentions(names, user_id: user_id)

  mention_nodes.each do |node|
    name = node.text[1..-1]
    name.downcase!

    type = mentions[name]
    next unless type

    node.name = "a"
    node.children = PrettyText::Helpers.format_username(node.children.text)

    case type
    when USER_TYPE
      node["href"] = "#{Discourse.base_path}/u/#{UrlHelper.encode_component(name)}"
    when GROUP_MENTIONABLE_TYPE, GROUP_TYPE
      # Both group kinds link to the group page; only the CSS class differs.
      node["class"] = type == GROUP_MENTIONABLE_TYPE ? "mention-group notify" : "mention-group"
      node["href"] = "#{Discourse.base_path}/groups/#{UrlHelper.encode_component(name)}"
    end
  end
end
2018-11-22 03:01:03 -05:00
# Resolves mention names to the kind of entity they refer to.
#
# names   - Array of mention names; they are matched in lowercase. The array
#           and its strings are not modified.
# user_id - id used to load the user whose Group.alias_levels are bound to the
#           query; nil is allowed.
#
# Returns a Hash of lowercased name => type, where type is USER_TYPE,
# GROUP_TYPE or GROUP_MENTIONABLE_TYPE. Rows are ordered by type and a later
# row overwrites an earlier one for the same name.
def self.lookup_mentions(names, user_id: nil)
  return {} if names.blank?

  # Every branch is restricted to the requested names; without that filter the
  # plain-group branch would return a row for every group in the table.
  sql = <<~SQL
  (
    SELECT
      :user_type AS type,
      username_lower AS name
    FROM users
    WHERE username_lower IN (:names) AND staged = false
  )
  UNION
  (
    SELECT
      :group_type AS type,
      lower(name) AS name
    FROM groups
    WHERE lower(name) IN (:names)
  )
  UNION
  (
    SELECT
      :group_mentionable_type AS type,
      lower(name) AS name
    FROM groups
    WHERE lower(name) IN (:names) AND (#{Group.mentionable_sql_clause(include_public: false)})
  )
  ORDER BY type
  SQL

  user = User.find_by(id: user_id)

  # Lowercase copies, so the caller's strings are left untouched (in-place
  # downcasing would also raise on frozen strings).
  lowered_names = names.map(&:downcase)

  # NOTE(review): :levels and :user_id are not referenced in the SQL text
  # above; presumably the interpolated mentionable clause uses them — confirm.
  results = DB.query(sql,
    names: lowered_names,
    user_type: USER_TYPE,
    group_type: GROUP_TYPE,
    group_mentionable_type: GROUP_MENTIONABLE_TYPE,
    levels: Group.alias_levels(user),
    user_id: user_id
  )

  results.each_with_object({}) { |row, mentions| mentions[row.name] = row.type }
end
FEATURE: Allow hotlinked media to be blocked (#16940)
This commit introduces a new site setting: `block_hotlinked_media`. When enabled, all attempts to hotlink media (images, videos, and audio) will fail, and be replaced with a linked placeholder. Exceptions to the rule can be added via `block_hotlinked_media_exceptions`.
`download_remote_image_to_local` can be used alongside this feature. In that case, hotlinked images will be blocked immediately when the post is created, but will then be replaced with the downloaded version a few seconds later.
This implementation is purely server-side, and does not impact the composer preview.
Technically, there are two stages to this feature:
1. `PrettyText.sanitize_hotlinked_media` is called during `PrettyText.cook`, and whenever new images are introduced by Onebox. It will iterate over all src/srcset attributes in the post HTML and check if they're allowed. If not, the attributes will be removed and replaced with a `data-blocked-hotlinked-src(set)` attribute.
2. In the `CookedPostProcessor`, we iterate over all `data-blocked-hotlinked-src(set)` attributes and check whether we have a downloaded version of the media. If yes, we update the src to use the downloaded version. If not, the entire media element is replaced with a placeholder. The placeholder is labelled 'external media', and is a link to the offsite media.
2022-06-07 10:23:04 -04:00
# Builds the regexp that decides whether a media `src` is allowed.
#
# A value matches when it starts with `data:` or with one of the allowed
# prefixes: the site's base path and base URL, the configured CDN URLs, the
# external emoji URL (when present) and each `|`-separated entry of the
# block_hotlinked_media_exceptions setting. nil entries are ignored.
#
# NOTE(review): if Discourse.base_path can be an empty string, its alternative
# reduces to `(?:/|\z)` and matches any value starting with "/" — confirm that
# is intended.
#
# Returns a Regexp anchored at the start of the string.
def self.allowed_src_pattern
  prefixes = [
    Discourse.base_path,
    Discourse.base_url,
    GlobalSetting.s3_cdn_url,
    GlobalSetting.cdn_url,
    SiteSetting.external_emoji_url.presence,
  ] + SiteSetting.block_hotlinked_media_exceptions.split("|")

  alternatives = prefixes.compact.map do |prefix|
    escaped = Regexp.escape(prefix)

    # A prefix without a trailing slash must be followed by "/" or the end of
    # the string, so allowing 'https://example.com' does not also allow
    # 'https://example.com.blah.com'.
    escaped.end_with?("/") ? escaped : "#{escaped}(?:/|\\z)"
  end

  /\A(data:|#{alternatives.join("|")})/
end
2013-02-05 14:16:51 -05:00
end