discourse/lib/pretty_text.rb

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

776 lines
25 KiB
Ruby
Raw Normal View History

# frozen_string_literal: true
require "mini_racer"
2013-02-05 14:16:51 -05:00
require "nokogiri"
require "erb"
2013-02-05 14:16:51 -05:00
module PrettyText
# Unicode bidirectional control characters (embeddings, overrides, isolates and
# their terminators) that can visually reorder the text around them.
DANGEROUS_BIDI_CHARACTERS = [
  "\u202A", "\u202B", "\u202C", "\u202D", "\u202E", # LRE, RLE, PDF, LRO, RLO
  "\u2066", "\u2067", "\u2068", "\u2069", # LRI, RLI, FSI, PDI
].freeze

# Matches any single character listed in DANGEROUS_BIDI_CHARACTERS.
DANGEROUS_BIDI_REGEXP = Regexp.union(DANGEROUS_BIDI_CHARACTERS).freeze

# Attribute names that receive the original value when a hotlinked
# src / srcset is blocked (see sanitize_hotlinked_media).
BLOCKED_HOTLINKED_SRC_ATTR = "data-blocked-hotlinked-src"
BLOCKED_HOTLINKED_SRCSET_ATTR = "data-blocked-hotlinked-srcset"

@mutex = Mutex.new
# Serialises creation and disposal of the shared JavaScript context.
@ctx_init = Mutex.new
2013-02-05 14:16:51 -05:00
# Root directory of the application; simply delegates to Rails.root.
def self.app_root
  Rails.root
end
# Resolves +filename+ (a path relative to +root+) to a file that exists on disk.
#
# The name is tried as given and then with each known JavaScript suffix, in
# priority order; the first candidate for which "#{root}#{candidate}" is a
# regular file wins.
#
# Returns the matching relative name, or nil when no candidate exists.
def self.find_file(root, filename)
  candidates = [
    filename,
    "#{filename}.js.es6",
    "#{filename}.js",
    "#{filename}.js.es6.erb",
    "#{filename}.js.erb",
  ]

  candidates.find { |candidate| File.file?("#{root}#{candidate}") }
end
# Loads one JavaScript source into the given JavaScript context.
#
# +part_name+ is resolved under +root_path+ via find_file. When found, the
# source is read (and rendered through ERB first if the resolved name ends in
# ".erb"), run through DiscourseJsProcessor::Transpiler and evaluated.
# Otherwise the same name is looked up under vendor/assets/javascripts and
# evaluated as-is. If neither lookup succeeds nothing is evaluated.
def self.apply_es6_file(ctx, root_path, part_name)
  filename = find_file(root_path, part_name)

  if filename
    source = File.read("#{root_path}#{filename}")
    # ERB templates are rendered with this method's binding.
    source = ERB.new(source).result(binding) if filename.end_with?(".erb")

    transpiler = DiscourseJsProcessor::Transpiler.new
    transpiled = transpiler.perform(source, "#{Rails.root}/app/assets/javascripts/", part_name)
    ctx.eval(transpiled)
  else
    # Look for vendored stuff
    vendor_root = "#{Rails.root}/vendor/assets/javascripts/"
    filename = find_file(vendor_root, part_name)
    ctx.eval(File.read("#{vendor_root}#{filename}")) if filename
  end
end
# Applies every entry found (recursively) below +path+ inside
# app/assets/javascripts to +ctx+, in sorted order.
#
# Each entry is handed to apply_es6_file as a path relative to the javascripts
# root with a trailing ".js" or ".js.es6" removed.
def self.ctx_load_directory(ctx, path)
  root_path = "#{Rails.root}/app/assets/javascripts/"

  Dir["#{root_path}#{path}/**/*"].sort.each do |f|
    # The dot before "es6" is escaped so only a literal ".js.es6" is stripped.
    part_name = f.sub(root_path, "").sub(/\.js(\.es6)?\z/, "")
    apply_es6_file(ctx, root_path, part_name)
  end
end
# Builds a fresh MiniRacer context with the JavaScript needed for cooking loaded.
#
# Sets up window/console shims, attaches the Rails logger and every
# PrettyText::Helpers instance method, then evaluates, in this order: the
# module loader and shims, the pretty-text sources, markdown-it, a handful of
# discourse-common / discourse libraries, plugin assets whose path contains
# "discourse-markdown", and the registered vendored pretty-text files.
#
# Returns the configured MiniRacer::Context.
def self.create_es6_context
  ctx = MiniRacer::Context.new(timeout: 25_000, ensure_gc_after_idle: 2000)

  ctx.eval("window = {}; window.devicePixelRatio = 2;") # hack to make code think stuff is retina

  # Expose the Rails logger to JavaScript and route console.* through it.
  ctx.attach("rails.logger.info", proc { |err| Rails.logger.info(err.to_s) })
  ctx.attach("rails.logger.warn", proc { |err| Rails.logger.warn(err.to_s) })
  ctx.attach("rails.logger.error", proc { |err| Rails.logger.error(err.to_s) })

  ctx.eval <<~JS
    console = {
    prefix: "[PrettyText] ",
    log: function(...args){ rails.logger.info(console.prefix + args.join(" ")); },
    warn: function(...args){ rails.logger.warn(console.prefix + args.join(" ")); },
    error: function(...args){ rails.logger.error(console.prefix + args.join(" ")); }
    }
  JS

  ctx.eval("__PRETTY_TEXT = true")

  PrettyText::Helpers.instance_methods.each do |method|
    ctx.attach("__helpers.#{method}", PrettyText::Helpers.method(method))
  end

  root_path = "#{Rails.root}/app/assets/javascripts/"
  ctx_load(ctx, "#{root_path}/node_modules/loader.js/dist/loader/loader.js")
  ctx_load(ctx, "#{root_path}/handlebars-shim.js")
  ctx_load(ctx, "#{root_path}/node_modules/xss/dist/xss.js")
  ctx.load("#{Rails.root}/lib/pretty_text/vendor-shims.js")
  ctx_load_directory(ctx, "pretty-text/addon")
  ctx_load_directory(ctx, "pretty-text/engines/discourse-markdown")
  ctx_load(ctx, "#{root_path}/node_modules/markdown-it/dist/markdown-it.js")

  apply_es6_file(ctx, root_path, "discourse-common/addon/lib/get-url")
  apply_es6_file(ctx, root_path, "discourse-common/addon/lib/object")
  apply_es6_file(ctx, root_path, "discourse-common/addon/lib/deprecated")
  apply_es6_file(ctx, root_path, "discourse-common/addon/lib/escape")
  apply_es6_file(ctx, root_path, "discourse-common/addon/utils/watched-words")
  apply_es6_file(ctx, root_path, "discourse/app/lib/to-markdown")
  apply_es6_file(ctx, root_path, "discourse/app/lib/utilities")

  ctx.load("#{Rails.root}/lib/pretty_text/shims.js")
  ctx.eval("__setUnicode(#{Emoji.unicode_replacements_json})")

  # Collect plugin assets that extend the markdown engine.
  to_load = []
  DiscoursePluginRegistry.each_globbed_asset do |a|
    to_load << a if File.file?(a) && a =~ /discourse-markdown/
  end

  to_load.uniq.each do |f|
    if f =~ %r{\A.+assets/javascripts/}
      # Everything up to and including "assets/javascripts/" is the asset root.
      root = Regexp.last_match[0]
      apply_es6_file(ctx, root, f.sub(root, "").sub(/\.js(\.es6)?\z/, ""))
    end
  end

  DiscoursePluginRegistry.vendored_core_pretty_text.each { |vpt| ctx.eval(File.read(vpt)) }
  DiscoursePluginRegistry.vendored_pretty_text.each { |vpt| ctx.eval(File.read(vpt)) }

  ctx
end
# Returns the shared JavaScript context, creating it on first use.
#
# The unlocked check is the fast path; creation happens under @ctx_init and
# re-checks @ctx there so only one context is ever built.
def self.v8
  return @ctx if @ctx

  # ensure we only init one of these
  @ctx_init.synchronize { @ctx ||= create_es6_context }
end
# Evaluates __resetTranslationTree() in the shared JavaScript context.
def self.reset_translations
  v8.eval("__resetTranslationTree()")
end
# Disposes the shared JavaScript context (if one exists) and forgets it, so the
# next call to v8 builds a new one. Runs under the same lock as creation.
def self.reset_context
  @ctx_init.synchronize do
    @ctx&.dispose
    @ctx = nil
  end
end
# Cooks +text+ (markdown) into HTML using the JavaScript engine in the shared context.
#
# Acceptable options:
#
#  disable_emojis    - Disables the emoji markdown engine.
#  features          - A hash where the key is the markdown feature name and the value is a boolean to enable/disable the markdown feature.
#                      The hash is merged into the default features set in pretty-text.js which can be used to add new features or disable existing features.
#  features_override - An array of markdown feature names to override the default markdown feature set. Currently used by plugins to customize what features should be enabled
#                      when rendering markdown.
#  markdown_it_rules - An array of markdown rule names which will be applied to the markdown-it engine. Currently used by plugins to customize what markdown-it rules should be
#                      enabled when rendering markdown.
#  topic_id          - Topic id for the post being cooked.
#  user_id           - User id for the post being cooked.
#  force_quote_link  - Always create the link to the quoted topic for [quote] bbcode. Normally this only happens
#                      if the topic_id provided is different from the [quote topic:X].
#  hashtag_context   - Defaults to "topic-composer" if not supplied. Controls the order of #hashtag lookup results
#                      based on registered hashtag contexts from the `#register_hashtag_search_param` plugin API
#                      method.
#  sanitize          - Pass false to disable the sanitizer.
#
# A nil +text+ is treated as an empty string. Returns the cooked HTML.
def self.markdown(text, opts = {})
  # we use the exact same markdown converter as the client
  # TODO: use the same extensions on both client and server (in particular the template for mentions)
  baked = nil
  text ||= ""

  protect do
    context = v8

    custom_emoji = {}
    Emoji.custom.each { |e| custom_emoji[e.name] = e.url }

    # note, any additional options added to __optInput here must be
    # also be added to the buildOptions function in pretty-text.js,
    # otherwise they will be discarded
    buffer = +<<~JS
      __optInput = {};
      __optInput.siteSettings = #{SiteSetting.client_settings_json};
      #{"__optInput.disableEmojis = true" if opts[:disable_emojis]}
      __paths = #{paths_json};
      __optInput.getURL = __getURL;
      #{"__optInput.features = #{opts[:features].to_json};" if opts[:features]}
      #{"__optInput.featuresOverride = #{opts[:features_override].to_json};" if opts[:features_override]}
      #{"__optInput.markdownItRules = #{opts[:markdown_it_rules].to_json};" if opts[:markdown_it_rules]}
      __optInput.getCurrentUser = __getCurrentUser;
      __optInput.lookupAvatar = __lookupAvatar;
      __optInput.lookupPrimaryUserGroup = __lookupPrimaryUserGroup;
      __optInput.formatUsername = __formatUsername;
      __optInput.getTopicInfo = __getTopicInfo;
      __optInput.categoryHashtagLookup = __categoryLookup;
      __optInput.hashtagLookup = __hashtagLookup;
      __optInput.customEmoji = #{custom_emoji.to_json};
      __optInput.customEmojiTranslation = #{Plugin::CustomEmoji.translations.to_json};
      __optInput.emojiUnicodeReplacer = __emojiUnicodeReplacer;
      __optInput.emojiDenyList = #{Emoji.denied.to_json};
      __optInput.lookupUploadUrls = __lookupUploadUrls;
      __optInput.censoredRegexp = #{WordWatcher.serializable_word_matcher_regexp(:censor, engine: :js).to_json};
      __optInput.watchedWordsReplace = #{WordWatcher.word_matcher_regexps(:replace, engine: :js).to_json};
      __optInput.watchedWordsLink = #{WordWatcher.word_matcher_regexps(:link, engine: :js).to_json};
      __optInput.additionalOptions = #{Site.markdown_additional_options.to_json};
    JS

    buffer << "__optInput.topicId = #{opts[:topic_id].to_i};\n" if opts[:topic_id]

    if opts[:force_quote_link]
      buffer << "__optInput.forceQuoteLink = #{opts[:force_quote_link]};\n"
    end

    buffer << "__optInput.userId = #{opts[:user_id].to_i};\n" if opts[:user_id]

    opts[:hashtag_context] ||= "topic-composer"

    # Emit the arrays through to_json so every entry is a correctly escaped
    # JavaScript string literal, whatever characters it contains.
    hashtag_types = HashtagAutocompleteService.ordered_types_for_context(opts[:hashtag_context])
    hashtag_icons = HashtagAutocompleteService.data_source_icons
    buffer << "__optInput.hashtagTypesInPriorityOrder = #{hashtag_types.map(&:to_s).to_json};\n"
    buffer << "__optInput.hashtagIcons = #{hashtag_icons.map(&:to_s).to_json};\n"

    buffer << "__textOptions = __buildOptions(__optInput);\n"
    buffer << "__pt = new __PrettyText(__textOptions);"

    # Be careful disabling sanitization. We allow for custom emails
    buffer << "__pt.disableSanitizer();" if opts[:sanitize] == false

    context.eval(buffer)

    DiscourseEvent.trigger(:markdown_context, context)
    baked = context.eval("__pt.cook(#{text.inspect})")
  end

  baked
end
# Serialises the URL roots handed to the JavaScript side as JSON.
#
# Always contains baseUri and CDN. When S3 uploads are enabled it also contains
# S3BaseUrl, plus S3CDN if an S3 CDN URL is configured.
def self.paths_json
  result = {
    baseUri: Discourse.base_path,
    CDN: Rails.configuration.action_controller.asset_host,
  }

  upload_settings = SiteSetting.Upload
  if upload_settings.enable_s3_uploads
    s3_cdn = upload_settings.s3_cdn_url
    result[:S3CDN] = s3_cdn if s3_cdn.present?
    result[:S3BaseUrl] = Discourse.store.absolute_base_url
  end

  result.to_json
end
2013-02-05 14:16:51 -05:00
# Builds avatar markup by calling __utils.avatarImg in the JavaScript context.
#
# leaving this here, cause it invokes v8, don't want to implement twice
#
# Both arguments are embedded into the script via #inspect. Returns whatever
# the JavaScript call evaluates to.
def self.avatar_img(avatar_template, size)
  protect { v8.eval(<<~JS) }
    __paths = #{paths_json};
    __utils.avatarImg({size: #{size.inspect}, avatarTemplate: #{avatar_template.inspect}}, __getURL);
  JS
end
2015-10-15 03:59:29 -04:00
# Runs __performEmojiUnescape on +title+ in the JavaScript context.
#
# Returns +title+ untouched when emoji are disabled or +title+ is nil/false;
# otherwise returns the result of the JavaScript call.
def self.unescape_emoji(title)
  return title unless SiteSetting.enable_emoji? && title

  set = SiteSetting.emoji_set.inspect
  custom = Emoji.custom.map { |e| [e.name, e.url] }.to_h.to_json
  # JSON-encode the CDN URL so quotes or backslashes in the setting cannot
  # break out of the JavaScript string literal. A blank setting becomes "".
  emoji_cdn_url = SiteSetting.external_emoji_url.presence.to_s.to_json

  protect { v8.eval(<<~JS) }
    __paths = #{paths_json};
    __performEmojiUnescape(#{title.inspect}, {
    getURL: __getURL,
    emojiSet: #{set},
    emojiCDNUrl: #{emoji_cdn_url},
    customEmoji: #{custom},
    enableEmojiShortcuts: #{SiteSetting.enable_emoji_shortcuts},
    inlineEmoji: #{SiteSetting.enable_inline_emoji_translation}
    });
  JS
end
# Runs __performEmojiEscape on +title+ in the JavaScript context.
#
# Returns nil when +title+ is nil/false. Emoji shortcuts are only replaced
# when both the enable_emoji and enable_emoji_shortcuts settings are on.
def self.escape_emoji(title)
  return unless title

  shortcuts_enabled = SiteSetting.enable_emoji && SiteSetting.enable_emoji_shortcuts

  protect do
    script = <<~JS
      __performEmojiEscape(#{title.inspect}, {
      emojiShortcuts: #{shortcuts_enabled},
      inlineEmoji: #{SiteSetting.enable_inline_emoji_translation}
      });
    JS
    v8.eval(script)
  end
end
2013-02-05 14:16:51 -05:00
# Cooks raw +text+ into the final HTML for a post.
#
# Works on copies of +text+ and +opts+: renders markdown, then post-processes
# the parsed fragment (rel attributes, bidi character warnings, hotlinked
# media, mentions) and finally removes any <script> nodes.
def self.cook(text, opts = {})
  options = opts.dup
  html = markdown(text.dup, options)
  doc = Nokogiri::HTML5.fragment(html)

  nofollow = !options[:omit_nofollow] && SiteSetting.add_rel_nofollow_to_user_content
  add_rel_attributes_to_user_content(doc, nofollow)
  strip_hidden_unicode_bidirectional_characters(doc)
  sanitize_hotlinked_media(doc)
  add_mentions(doc, user_id: opts[:user_id]) if SiteSetting.enable_mentions

  script_remover = Loofah::Scrubber.new { |node| node.remove if node.name == "script" }
  Loofah.fragment(doc.to_html).scrub!(script_remover).to_html
end
# Makes bidirectional control characters inside code/pre elements visible.
#
# Each dangerous character is replaced by a <span class="bidi-warning"> that
# shows its code point (for example <U+202E>). Nothing is touched when the
# document contains none of these characters.
def self.strip_hidden_unicode_bidirectional_characters(doc)
  return unless DANGEROUS_BIDI_REGEXP.match?(doc.content)

  doc.css("code,pre").each do |block|
    next unless DANGEROUS_BIDI_REGEXP.match?(block.content)

    DANGEROUS_BIDI_CHARACTERS.each do |char|
      next unless block.content.include?(char)

      codepoint = char.ord.to_s(16).upcase
      warning =
        "<span class=\"bidi-warning\" title=\"#{I18n.t("post.hidden_bidi_character")}\">&lt;U+#{codepoint}&gt;</span>"
      block.inner_html = block.inner_html.gsub(char, warning)
    end
  end
end
# Neutralises hotlinked media when the block_hotlinked_media setting is on.
#
# For every img/source/track element, a src that does not match
# allowed_src_pattern is moved to BLOCKED_HOTLINKED_SRC_ATTR. A srcset is moved
# to BLOCKED_HOTLINKED_SRCSET_ATTR as soon as any of its URLs is not allowed.
def self.sanitize_hotlinked_media(doc)
  return unless SiteSetting.block_hotlinked_media

  allowed_pattern = allowed_src_pattern

  doc.css("img[src], source[src], source[srcset], track[src]").each do |el|
    if el["src"] && !el["src"].match?(allowed_pattern)
      el[PrettyText::BLOCKED_HOTLINKED_SRC_ATTR] = el.delete("src")
    end

    next unless el["srcset"]

    # Each candidate is "<url> [descriptor]". Empty candidates (stray or
    # doubled commas) carry no URL and are dropped instead of raising on nil.
    srcs = el["srcset"].split(",").map { |candidate| candidate.split(" ", 2)[0].presence }.compact

    if srcs.any? { |src| !src.match?(allowed_pattern) }
      el[PrettyText::BLOCKED_HOTLINKED_SRCSET_ATTR] = el.delete("srcset")
    end
  end
end
# Sets the rel attribute on every link in +doc+.
#
# - Links with target="_blank" get rel="noopener".
# - When +add_nofollow+ is true, links to a host other than this site, one of
#   its subdomains or an allowlisted domain (exclude_rel_nofollow_domains,
#   "|"-separated) get rel="noopener nofollow ugc".
# - Links whose href cannot be parsed always get rel="noopener nofollow ugc".
def self.add_rel_attributes_to_user_content(doc, add_nofollow)
  domains = SiteSetting.exclude_rel_nofollow_domains
  allowlist = domains.present? ? domains.split("|") : []

  site_uri = nil

  doc.css("a").each do |l|
    href = l["href"].to_s
    l["rel"] = "noopener" if l["target"] == "_blank"

    begin
      uri = URI(UrlHelper.encode_component(href))
      site_uri ||= URI(Discourse.base_url)

      same_domain =
        !uri.host.present? || uri.host == site_uri.host ||
          uri.host.ends_with?(".#{site_uri.host}") ||
          allowlist.any? { |u| uri.host == u || uri.host.ends_with?(".#{u}") }

      l["rel"] = "noopener nofollow ugc" if add_nofollow && !same_domain
    rescue URI::Error
      # add a nofollow anyway
      l["rel"] = "noopener nofollow ugc"
    end
  end
end
# A link found by extract_links: its URL and whether it came from a quote.
class DetectedLink < Struct.new(:url, :is_quote); end
2014-07-11 00:17:01 -04:00
2013-02-05 14:16:51 -05:00
# Collects the links referenced by cooked +html+ as DetectedLink objects.
#
# In order: onebox source URLs, remaining anchors (anchors inside quotes,
# oneboxes and elided sections, and linked onebox images, are removed first;
# "#fragment" hrefs are skipped), quoted topics/posts (flagged is_quote) and
# embedded videos from known providers.
def self.extract_links(html)
  links = []
  doc = Nokogiri::HTML5.fragment(html)

  # extract onebox links
  doc
    .css("aside.onebox[data-onebox-src]")
    .each { |onebox| links << DetectedLink.new(onebox["data-onebox-src"], false) }

  # remove href inside quotes & oneboxes & elided part
  doc.css("aside.quote a, aside.onebox a, .elided a").remove

  # remove hotlinked images
  doc.css("a.onebox > img").each { |img| img.parent.remove }

  # extract all links
  doc
    .css("a")
    .each do |a|
      links << DetectedLink.new(a["href"], false) if a["href"].present? && a["href"][0] != "#"
    end

  # extract quotes
  doc
    .css("aside.quote[data-topic]")
    .each do |aside|
      next unless aside["data-topic"].present?

      url = +"/t/#{aside["data-topic"]}"
      url << "/#{aside["data-post"]}" if aside["data-post"].present?
      links << DetectedLink.new(url, true)
    end

  # extract video links
  video_base_urls = {
    "youtube" => "https://www.youtube.com/watch?v=",
    "vimeo" => "https://vimeo.com/",
    "tiktok" => "https://m.tiktok.com/v/",
  }

  doc
    .css("div[data-video-id]")
    .each do |div|
      next unless div["data-video-id"].present? && div["data-provider-name"].present?

      base_url = video_base_urls[div["data-provider-name"]]
      # Unknown providers have no URL scheme here; skip them rather than
      # raising on nil + String.
      next unless base_url

      links << DetectedLink.new(base_url + div["data-video-id"], false)
    end

  links
end
# Returns the unique, normalized names mentioned in +cooked+.
#
# Reads the text of every .mention / .mention-group element, drops its first
# character and passes the rest through User.normalize_username.
def self.extract_mentions(cooked)
  names =
    cooked.css(".mention, .mention-group").map do |element|
      text = element.inner_text
      User.normalize_username(text[1..-1]) if text
    end

  names.compact.uniq
end
2013-05-27 19:48:47 -04:00
# Produces an excerpt of +html+ that is at most +max_length+ long.
#
# Before delegating to ExcerptParser the markup is trimmed: the
# :reduce_excerpt event is triggered, lightbox metadata and oneboxed
# audio/video are removed and, when requested through options[:plain_hashtags]
# (and the experimental hashtag setting is on), cooked hashtags become text.
def self.excerpt(html, max_length, options = {})
  # TODO: properly fix this HACK in ExcerptParser without introducing XSS
  doc = Nokogiri::HTML5.fragment(html)
  DiscourseEvent.trigger(:reduce_excerpt, doc, options)
  strip_image_wrapping(doc)
  strip_oneboxed_media(doc)

  if SiteSetting.enable_experimental_hashtag_autocomplete && options[:plain_hashtags]
    convert_hashtag_links_to_plaintext(doc)
  end

  ExcerptParser.get_excerpt(doc.to_html, max_length, options)
end
2013-02-05 14:16:51 -05:00
# Replaces every cooked hashtag link with plain "#<data-slug>" text.
def self.convert_hashtag_links_to_plaintext(doc)
  doc.css("a.hashtag-cooked").each do |link|
    slug = link.attributes["data-slug"]
    link.replace("##{slug}")
  end
end
# Unwraps every <a> in +string+, keeping the link's inner HTML.
# Blank input is returned unchanged.
def self.strip_links(string)
  return string if string.blank?

  # If the user is not basic, strip links from their bio
  doc = Nokogiri::HTML5.fragment(string)
  doc.css("a").each { |anchor| anchor.replace(anchor.inner_html) }
  doc.to_html
end
# Prefixes host-less link hrefs in +doc+ with the site's base URL.
#
# Links that already have a host, mailto links and links whose href cannot be
# parsed as a URI are left as they are.
def self.make_all_links_absolute(doc)
  site_uri = nil

  doc.css("a").each do |link|
    href = link["href"].to_s

    begin
      uri = URI(href)
      site_uri ||= URI(Discourse.base_url)

      unless uri.host.present? || href.start_with?("mailto")
        link["href"] = "#{site_uri}#{link["href"]}"
      end
    rescue URI::Error
      # leave it
    end
  end
end
# Removes the ".meta" elements found inside lightbox wrappers.
def self.strip_image_wrapping(doc)
  doc.css(".lightbox-wrapper .meta").remove
end
# Removes audio elements, then video elements and video oneboxes, from +doc+.
def self.strip_oneboxed_media(doc)
  doc.css("audio").remove
  doc.css(".video-onebox,video").remove
end
# Replaces embedded Vimeo player iframes with a paragraph linking to the video.
#
# The link is the iframe's data-original-href (normalized) when present;
# otherwise it is built from the last segment of the iframe src, with "?h="
# turned into a path separator.
def self.convert_vimeo_iframes(doc)
  doc.css("iframe[src*='player.vimeo.com']").each do |iframe|
    original_href = iframe["data-original-href"]

    vimeo_url =
      if original_href.present?
        UrlHelper.normalized_encode(original_href)
      else
        video_path = iframe["src"].split("/").last.sub("?h=", "/")
        "https://vimeo.com/#{video_path}"
      end

    iframe.replace Nokogiri::HTML5.fragment("<p><a href='#{vimeo_url}'>#{vimeo_url}</a></p>")
  end
end
# Replaces media that points at secure uploads with a placeholder notice.
#
# First pass: links to secure uploads that wrap an image, or that sit directly
# inside a <video>/<audio> element, are swapped for a placeholder. Second
# pass: any remaining <img> whose URL is a secure upload is swapped too,
# carrying over its dimensions and, inside a onebox body, the onebox type.
def self.strip_secure_uploads(doc)
  # images inside a lightbox or other link
  doc
    .css("a[href]")
    .each do |a|
      next if !Upload.secure_uploads_url?(a["href"])

      non_image_media = %w[video audio].include?(a.parent&.name)
      target = non_image_media ? a.parent : a

      # already converted to a placeholder
      target_html = target.to_s
      if target_html.include?("stripped-secure-view-media") ||
           target_html.include?("stripped-secure-view-upload")
        next
      end

      img = a.css("img[src]").first
      next if img.nil? && !non_image_media

      # A lightbox link without an image (only possible for audio/video) is
      # handled by the generic branch instead of dereferencing a nil image.
      if a.classes.include?("lightbox") && img
        srcset = img.attributes["srcset"]&.value

        # if available, use the first image from the srcset here
        # so we get the optimized image instead of the possibly huge original
        url = srcset ? srcset.split(",").first : img["src"]

        a.add_next_sibling secure_uploads_placeholder(
                             doc,
                             url,
                             width: img["width"],
                             height: img["height"],
                           )
        a.remove
      else
        width = non_image_media ? nil : a.at_css("img").attr("width")
        height = non_image_media ? nil : a.at_css("img").attr("height")

        target.add_next_sibling secure_uploads_placeholder(
                                  doc,
                                  a["href"],
                                  width: width,
                                  height: height,
                                )
        target.remove
      end
    end

  # images by themselves or inside a onebox
  doc
    .css("img[src]")
    .each do |img|
      url =
        if img.parent.classes.include?("aspect-image") && img.attributes["srcset"].present?
          # we are using the first image from the srcset here so we get the
          # optimized image instead of the original, because an optimized
          # image may be used for the onebox thumbnail
          srcset = img.attributes["srcset"].value
          srcset.split(",").first
        else
          img["src"]
        end

      width = img["width"]
      height = img["height"]
      onebox_type = nil

      if img.ancestors.css(".onebox-body").any?
        if img.classes.include?("onebox-avatar-inline")
          onebox_type = "avatar-inline"
        else
          onebox_type = "thumbnail"
        end
      end

      # we always want this to be tiny and without any special styles
      if img.classes.include?("site-icon")
        onebox_type = nil
        width = 16
        height = 16
      end

      if Upload.secure_uploads_url?(url)
        img.add_next_sibling secure_uploads_placeholder(
                               doc,
                               url,
                               onebox_type: onebox_type,
                               width: width,
                               height: height,
                             )
        img.remove
      end
    end
end
# Builds the HTML notice that replaces a secure upload stripped from an email.
#
# The notice is a `div.secure-upload-notice` carrying the upload URL in
# `data-stripped-secure-upload`, plus a link to the same URL so the reader can
# open the redacted media.
#
# NOTE(review): `url`, `onebox_type`, `width` and `height` are interpolated
# as-is (no HTML escaping, width/height unquoted) — callers are presumably
# expected to pass already-sanitized values; confirm.
#
# @param doc unused here; kept so existing call sites keep working
# @param url URL of the redacted upload
# @param onebox_type when truthy, emitted as a `data-onebox-type` attribute
# @param width when truthy, emitted as a `data-width` attribute
# @param height when truthy, emitted as a `data-height` attribute
# @return [String] the placeholder HTML
def self.secure_uploads_placeholder(doc, url, onebox_type: false, width: nil, height: nil)
  data_width = width ? "data-width=#{width}" : ""
  data_height = height ? "data-height=#{height}" : ""
  data_onebox_type = onebox_type ? "data-onebox-type='#{onebox_type}'" : ""

  # Absent attributes interpolate as empty strings, so the separating spaces
  # remain in the opening tag.
  <<~HTML
    <div class="secure-upload-notice" data-stripped-secure-upload="#{url}" #{data_onebox_type} #{data_width} #{data_height}>
    #{I18n.t("emails.secure_uploads_placeholder")} <a class='stripped-secure-view-upload' href="#{url}">#{I18n.t("emails.view_redacted_media")}</a>.
    </div>
  HTML
end
# Prepares cooked HTML for use in an email body.
#
# The HTML is parsed as an HTML5 fragment, handed to `:reduce_cooked`
# listeners, and then passed through the email clean-up steps in a fixed
# order. Secure uploads are only stripped when a post is given and it reports
# `with_secure_uploads?`.
#
# @param html the cooked HTML to transform
# @param post optional post the HTML belongs to
# @return the serialized fragment (`to_html`)
def self.format_for_email(html, post = nil)
  fragment = Nokogiri::HTML5.fragment(html)

  # Give listeners a chance to trim the content before anything else runs.
  DiscourseEvent.trigger(:reduce_cooked, fragment, post)

  if post&.with_secure_uploads?
    strip_secure_uploads(fragment)
  end

  strip_image_wrapping(fragment)
  convert_vimeo_iframes(fragment)
  make_all_links_absolute(fragment)

  fragment.to_html
end
2013-05-27 19:48:47 -04:00
protected
2013-02-05 14:16:51 -05:00
# Error whose message and backtrace are supplied by the caller at construction
# time. Both are exposed through accessors defined here, which replace the
# readers inherited from StandardError; `super` is not invoked.
class JavaScriptError < StandardError
  attr_reader :message, :backtrace
  attr_writer :message, :backtrace

  # @param message the error message to report
  # @param backtrace the backtrace to report
  def initialize(message, backtrace)
    self.message = message
    self.backtrace = backtrace
  end
end
# Runs the given block while holding @mutex and returns the block's result.
def self.protect
  # Mutex#synchronize hands back the block's value, so no temp is needed.
  @mutex.synchronize { yield }
end
# Loads each given file into the context.
#
# @param ctx object responding to `load(path)`
# @param files paths appended to `app_root` with `+`
# @return the `files` array (result of `each`)
def self.ctx_load(ctx, *files)
  files.each { |file| ctx.load(app_root + file) }
end
# NOTE: a bare `private` only changes the visibility of instance methods
# defined after it; the `def self.` methods below are singleton methods and
# stay publicly callable.
private
# Type labels selected by the SQL in lookup_mentions and matched by the `case`
# in add_mentions.
# `||=` leaves an already-defined constant untouched (presumably to avoid
# "already initialized constant" warnings when the file is reloaded — confirm).
USER_TYPE ||= "user"
GROUP_TYPE ||= "group"
GROUP_MENTIONABLE_TYPE ||= "group-mentionable"
# Turns `span.mention` elements into links for names that resolve to a user
# or a group.
#
# Each span's text minus its first character is treated as the mentioned name
# and looked up (lowercased) via lookup_mentions. Resolved spans are renamed
# to `a`, their children are replaced with the formatted username, and they
# get an `href` (plus a `class` for groups). Unresolved spans are left as-is.
#
# @param doc document/fragment responding to `css`
# @param user_id passed through to lookup_mentions
# @return the collection returned by `doc.css("span.mention")`
def self.add_mentions(doc, user_id: nil)
  elements = doc.css("span.mention")

  # An empty span has no text after the first character (`""[1..-1]` is nil);
  # such entries are dropped so the lookup never sees nil.
  names = elements.map { |element| element.text[1..-1] }.compact
  mentions = lookup_mentions(names, user_id: user_id)

  elements.each do |element|
    name = element.text[1..-1]&.downcase
    type = name && mentions[name]
    next unless type

    element.name = "a"
    element.children = PrettyText::Helpers.format_username(element.children.text)

    case type
    when USER_TYPE
      element["href"] = "#{Discourse.base_path}/u/#{UrlHelper.encode_component(name)}"
    when GROUP_MENTIONABLE_TYPE
      element["class"] = "mention-group notify"
      element["href"] = "#{Discourse.base_path}/groups/#{UrlHelper.encode_component(name)}"
    when GROUP_TYPE
      element["class"] = "mention-group"
      element["href"] = "#{Discourse.base_path}/groups/#{UrlHelper.encode_component(name)}"
    end
  end
end
# Resolves mention names to the kind of record they refer to.
#
# A single UNION query collects, for the requested names only:
#   * non-staged users            -> USER_TYPE
#   * groups                      -> GROUP_TYPE
#   * groups matching Group.mentionable_sql_clause -> GROUP_MENTIONABLE_TYPE
# Rows are ordered by type and written into the result hash in that order, so
# when one name appears in several subqueries the row that sorts last wins.
#
# NOTE: the strings in +names+ are downcased in place before querying.
#
# @param names collection of names to resolve; returns {} when blank
# @param user_id id of the user whose alias levels are bound into the query
# @return [Hash] lowercased name => type label
def self.lookup_mentions(names, user_id: nil)
  return {} if names.blank?

  # Every subquery is restricted to the requested names; without the filter
  # the plain-group branch would return every group in the table.
  sql = <<~SQL
    (
      SELECT
        :user_type AS type,
        username_lower AS name
      FROM users
      WHERE username_lower IN (:names) AND staged = false
    )
    UNION
    (
      SELECT
        :group_type AS type,
        lower(name) AS name
      FROM groups
      WHERE lower(name) IN (:names)
    )
    UNION
    (
      SELECT
        :group_mentionable_type AS type,
        lower(name) AS name
      FROM groups
      WHERE lower(name) IN (:names) AND (#{Group.mentionable_sql_clause(include_public: false)})
    )
    ORDER BY type
  SQL

  user = User.find_by(id: user_id)
  names.each(&:downcase!)

  results =
    DB.query(
      sql,
      names: names,
      user_type: USER_TYPE,
      group_type: GROUP_TYPE,
      group_mentionable_type: GROUP_MENTIONABLE_TYPE,
      levels: Group.alias_levels(user),
      user_id: user_id,
    )

  # Later rows overwrite earlier ones for the same name.
  results.each_with_object({}) { |result, mentions| mentions[result.name] = result.type }
end
# Builds the regexp used to recognise `src` values that are allowed: anything
# starting with `data:` or with one of the configured prefixes (base path,
# base URL, CDN URLs, external emoji URL, and the `|`-separated hotlink
# exceptions). Unset (nil) prefixes are ignored.
#
# @return [Regexp] pattern anchored at the start of the string
def self.allowed_src_pattern
  prefixes = [
    Discourse.base_path,
    Discourse.base_url,
    GlobalSetting.s3_cdn_url,
    GlobalSetting.cdn_url,
    SiteSetting.external_emoji_url.presence,
  ]
  prefixes.concat(SiteSetting.block_hotlinked_media_exceptions.split("|"))

  alternatives =
    prefixes.compact.map do |prefix|
      escaped = Regexp.escape(prefix)

      # A prefix without a trailing slash must be followed by "/" or the end of
      # the string, so 'https://example.com' does not also allow
      # 'https://example.com.blah.com'.
      escaped.end_with?("/") ? escaped : escaped + '(?:/|\z)'
    end

  /\A(data:|#{alternatives.join("|")})/
end
2013-02-05 14:16:51 -05:00
end