From 9bff0882c3df594850de68ade2697407c84dcd7b Mon Sep 17 00:00:00 2001 From: Krzysztof Kotlarek Date: Tue, 5 May 2020 13:46:57 +1000 Subject: [PATCH] FEATURE: Nokogumbo (#9577) * FEATURE: Nokogumbo Use Nokogumbo HTML parser. --- app/helpers/user_notifications_helper.rb | 4 +- app/jobs/onceoff/grant_emoji.rb | 2 +- app/jobs/onceoff/grant_onebox.rb | 2 +- app/jobs/regular/pull_hotlinked_images.rb | 2 +- app/jobs/regular/update_username.rb | 4 +- app/models/category.rb | 2 +- app/models/post.rb | 2 +- app/models/post_analyzer.rb | 2 +- app/models/quoted_post.rb | 2 +- app/models/theme_field.rb | 2 +- app/models/topic_embed.rb | 8 +- app/services/inline_uploads.rb | 6 +- app/services/search_indexer.rb | 2 +- ...3951_backfill_post_upload_reverse_index.rb | 2 +- db/migrate/20140715055242_add_quoted_posts.rb | 2 +- lib/content_security_policy/extension.rb | 3 +- lib/cooked_post_processor.rb | 4 +- lib/discourse_diff.rb | 2 +- lib/email/receiver.rb | 2 +- lib/email/styles.rb | 10 +- .../engine/whitelisted_generic_onebox.rb | 2 +- lib/oneboxer.rb | 6 +- lib/post_revisor.rb | 2 +- lib/pretty_text.rb | 26 +++-- lib/quote_comparer.rb | 2 +- lib/retrieve_title.rb | 2 +- lib/reviewable/conversation.rb | 2 +- lib/tasks/emoji.rake | 2 +- .../spec/components/pretty_text_spec.rb | 4 +- .../lib/discourse_narrative_bot/actions.rb | 2 +- .../advanced_user_narrative.rb | 6 +- .../new_user_narrative.rb | 8 +- plugins/poll/plugin.rb | 2 +- plugins/poll/spec/lib/pretty_text_spec.rb | 2 +- script/import_scripts/ipboard3.rb | 2 +- script/import_scripts/jive.rb | 2 +- script/import_scripts/jive_api.rb | 2 +- script/import_scripts/lithium.rb | 2 +- spec/components/cooked_post_processor_spec.rb | 107 ++++++++---------- spec/components/email/styles_spec.rb | 4 +- spec/components/excerpt_parser_spec.rb | 2 +- spec/components/pretty_text_spec.rb | 32 +++--- spec/lib/content_security_policy_spec.rb | 6 +- spec/models/topic_embed_spec.rb | 4 +- spec/requests/categories_controller_spec.rb | 2 +- spec/requests/email_controller_spec.rb | 6 +- spec/requests/embed_controller_spec.rb | 2 +- .../requests/user_api_keys_controller_spec.rb | 2 +- spec/services/username_changer_spec.rb | 34 +++--- spec/support/match_html_matcher.rb | 2 +- 50 files changed, 165 insertions(+), 179 deletions(-) diff --git a/app/helpers/user_notifications_helper.rb b/app/helpers/user_notifications_helper.rb index eb0293183b2..ffbf7352c2a 100644 --- a/app/helpers/user_notifications_helper.rb +++ b/app/helpers/user_notifications_helper.rb @@ -13,7 +13,7 @@ module UserNotificationsHelper end def correct_top_margin(html, desired) - fragment = Nokogiri::HTML.fragment(html) + fragment = Nokogiri::HTML5.fragment(html) if para = fragment.css("p:first").first para["style"] = "margin-top: #{desired};" end @@ -32,7 +32,7 @@ module UserNotificationsHelper end def first_paragraphs_from(html) - doc = Nokogiri::HTML(html) + doc = Nokogiri::HTML5(html) result = +"" length = 0 diff --git a/app/jobs/onceoff/grant_emoji.rb b/app/jobs/onceoff/grant_emoji.rb index 5f85b431b82..5abdb34d5d4 100644 --- a/app/jobs/onceoff/grant_emoji.rb +++ b/app/jobs/onceoff/grant_emoji.rb @@ -14,7 +14,7 @@ module Jobs .where("cooked LIKE '%emoji%'") .find_in_batches do |group| group.each do |p| - doc = Nokogiri::HTML::fragment(p.cooked) + doc = Nokogiri::HTML5::fragment(p.cooked) if (doc.css("img.emoji") - doc.css(".quote img")).size > 0 to_award[p.user_id] ||= { post_id: p.id, created_at: p.created_at } end diff --git a/app/jobs/onceoff/grant_onebox.rb b/app/jobs/onceoff/grant_onebox.rb index 59cf443f4e5..66d2cf26706 100644 --- a/app/jobs/onceoff/grant_onebox.rb +++ b/app/jobs/onceoff/grant_onebox.rb @@ -19,7 +19,7 @@ module Jobs begin # Note we can't use `p.cooked` here because oneboxes have been cooked out cooked = PrettyText.cook(p.raw) - doc = Nokogiri::HTML::fragment(cooked) + doc = Nokogiri::HTML5::fragment(cooked) if doc.search('a.onebox').size > 0 to_award[p.user_id] ||= { post_id: p.id, created_at: p.created_at } end diff --git a/app/jobs/regular/pull_hotlinked_images.rb b/app/jobs/regular/pull_hotlinked_images.rb index 25703659460..e7644e8ca84 100644 --- a/app/jobs/regular/pull_hotlinked_images.rb +++ b/app/jobs/regular/pull_hotlinked_images.rb @@ -157,7 +157,7 @@ module Jobs end def extract_images_from(html) - doc = Nokogiri::HTML::fragment(html) + doc = Nokogiri::HTML5::fragment(html) doc.css("img[src], a.lightbox[href]") - doc.css("img.avatar") - diff --git a/app/jobs/regular/update_username.rb b/app/jobs/regular/update_username.rb index d43c119060d..7c9fcedb5a2 100644 --- a/app/jobs/regular/update_username.rb +++ b/app/jobs/regular/update_username.rb @@ -154,11 +154,11 @@ module Jobs # and there is no reason to invalidate oneboxes, run the post analyzer etc. # when only the username changes. def update_cooked(cooked) - doc = Nokogiri::HTML.fragment(cooked) + doc = Nokogiri::HTML5.fragment(cooked) doc.css("a.mention").each do |a| a.content = a.content.gsub(@cooked_mention_username_regex, "@#{@new_username}") - a["href"] = a["href"].gsub(@cooked_mention_user_path_regex, "/u/#{@new_username}") if a["href"] + a["href"] = a["href"].gsub(@cooked_mention_user_path_regex, "/u/#{URI.escape(@new_username)}") if a["href"] end doc.css("aside.quote").each do |aside| diff --git a/app/models/category.rb b/app/models/category.rb index bbf1e9b98cb..ddab403c5d2 100644 --- a/app/models/category.rb +++ b/app/models/category.rb @@ -306,7 +306,7 @@ class Category < ActiveRecord::Base @@cache_text ||= LruRedux::ThreadSafeCache.new(1000) @@cache_text.getset(self.description) do - text = Nokogiri::HTML.fragment(self.description).text.strip + text = Nokogiri::HTML5.fragment(self.description).text.strip Rack::Utils.escape_html(text).html_safe end end diff --git a/app/models/post.rb b/app/models/post.rb index 5da5d163200..f2ee10543f3 100644 --- a/app/models/post.rb +++ b/app/models/post.rb @@ -953,7 +953,7 @@ class Post < ActiveRecord::Base /\/uploads\/short-url\/[a-zA-Z0-9]+(\.[a-z0-9]+)?/ ] - fragments ||= Nokogiri::HTML::fragment(self.cooked) + fragments ||= Nokogiri::HTML5::fragment(self.cooked) selectors = fragments.css("a/@href", "img/@src", "source/@src", "track/@src", "video/@poster") links = selectors.map do |media| diff --git a/app/models/post_analyzer.rb b/app/models/post_analyzer.rb index 63fe9724b4d..bae36c39f86 100644 --- a/app/models/post_analyzer.rb +++ b/app/models/post_analyzer.rb @@ -131,7 +131,7 @@ class PostAnalyzer def cooked_stripped @cooked_stripped ||= begin - doc = Nokogiri::HTML.fragment(cook(@raw, topic_id: @topic_id)) + doc = Nokogiri::HTML5.fragment(cook(@raw, topic_id: @topic_id)) doc.css("pre .mention, aside.quote > .title, aside.quote .mention, aside.quote .mention-group, .onebox, .elided").remove doc end diff --git a/app/models/quoted_post.rb b/app/models/quoted_post.rb index 03b981e1fb3..9a6a96e9ebf 100644 --- a/app/models/quoted_post.rb +++ b/app/models/quoted_post.rb @@ -9,7 +9,7 @@ class QuotedPost < ActiveRecord::Base # we are double parsing this fragment, this may be worth optimising later def self.extract_from(post) - doc = Nokogiri::HTML.fragment(post.cooked) + doc = Nokogiri::HTML5.fragment(post.cooked) uniq = {} diff --git a/app/models/theme_field.rb b/app/models/theme_field.rb index 351c7b90f1a..a9f98ce4078 100644 --- a/app/models/theme_field.rb +++ b/app/models/theme_field.rb @@ -78,7 +78,7 @@ class ThemeField < ActiveRecord::Base js_compiler = ThemeJavascriptCompiler.new(theme_id, self.theme.name) - doc = Nokogiri::HTML.fragment(html) + doc = Nokogiri::HTML5.fragment(html) doc.css('script[type="text/x-handlebars"]').each do |node| name = node["name"] || node["data-template-name"] || "broken" diff --git a/app/models/topic_embed.rb b/app/models/topic_embed.rb index 43de234a159..a18e5b85dea 100644 --- a/app/models/topic_embed.rb +++ b/app/models/topic_embed.rb @@ -126,7 +126,7 @@ class TopicEmbed < ActiveRecord::Base return end - raw_doc = Nokogiri::HTML(html) + raw_doc = Nokogiri::HTML5(html) auth_element = raw_doc.at('meta[@name="author"]') if auth_element.present? response.author = User.where(username_lower: auth_element[:content].strip).first @@ -142,7 +142,7 @@ class TopicEmbed < ActiveRecord::Base title.strip! end response.title = title - doc = Nokogiri::HTML(read_doc.content) + doc = Nokogiri::HTML5(read_doc.content) tags = { 'img' => 'src', 'script' => 'src', 'a' => 'href' } doc.search(tags.keys.join(',')).each do |node| @@ -198,7 +198,7 @@ class TopicEmbed < ActiveRecord::Base prefix = "#{uri.scheme}://#{uri.host}" prefix += ":#{uri.port}" if uri.port != 80 && uri.port != 443 - fragment = Nokogiri::HTML.fragment("
#{contents}
") + fragment = Nokogiri::HTML5.fragment("
#{contents}
") fragment.css('a').each do |a| href = a['href'] if href.present? && href.start_with?('/') @@ -220,7 +220,7 @@ class TopicEmbed < ActiveRecord::Base end def self.first_paragraph_from(html) - doc = Nokogiri::HTML(html) + doc = Nokogiri::HTML5(html) result = +"" doc.css('p').each do |p| diff --git a/app/services/inline_uploads.rb b/app/services/inline_uploads.rb index e75348bccee..0facc329fba 100644 --- a/app/services/inline_uploads.rb +++ b/app/services/inline_uploads.rb @@ -16,7 +16,7 @@ class InlineUploads end end - cooked_fragment = Nokogiri::HTML::fragment(PrettyText.cook(markdown, disable_emojis: true)) + cooked_fragment = Nokogiri::HTML5::fragment(PrettyText.cook(markdown, disable_emojis: true)) link_occurences = [] cooked_fragment.traverse do |node| @@ -183,7 +183,7 @@ class InlineUploads def self.match_anchor(markdown, external_href: false) markdown.scan(/(()([^<\a>]*?)<\/a>)/i) do |match| - node = Nokogiri::HTML::fragment(match[0]).children[0] + node = Nokogiri::HTML5::fragment(match[0]).children[0] href = node.attributes["href"]&.value if href && (matched_uploads(href).present? || external_href) @@ -199,7 +199,7 @@ class InlineUploads def self.match_img(markdown, external_src: false) markdown.scan(/(<(?!img)[^<>]+\/?>)?(\s*)(\n]+>)/i) do |match| - node = Nokogiri::HTML::fragment(match[2].strip).children[0] + node = Nokogiri::HTML5::fragment(match[2].strip).children[0] src = node.attributes["src"]&.value if src && (matched_uploads(src).present? || external_src) diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb index 17817c4c099..ceeaaf27f25 100644 --- a/app/services/search_indexer.rb +++ b/app/services/search_indexer.rb @@ -191,7 +191,7 @@ class SearchIndexer def self.scrub(html, strip_diacritics: false) return +"" if html.blank? - document = Nokogiri::HTML("
#{html}
", nil, Encoding::UTF_8.to_s) + document = Nokogiri::HTML5("
#{html}
", nil, Encoding::UTF_8.to_s) nodes = document.css( "div.#{CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS}" diff --git a/db/migrate/20131014203951_backfill_post_upload_reverse_index.rb b/db/migrate/20131014203951_backfill_post_upload_reverse_index.rb index 793da0ce64a..0c5f4bb3e67 100644 --- a/db/migrate/20131014203951_backfill_post_upload_reverse_index.rb +++ b/db/migrate/20131014203951_backfill_post_upload_reverse_index.rb @@ -8,7 +8,7 @@ class BackfillPostUploadReverseIndex < ActiveRecord::Migration[4.2] # fill the reverse index up Post.select([:id, :cooked]).find_each do |post| - doc = Nokogiri::HTML::fragment(post.cooked) + doc = Nokogiri::HTML5::fragment(post.cooked) # images doc.search("img").each { |img| add_to_reverse_index(img['src'], post.id) } # thumbnails and/or attachments diff --git a/db/migrate/20140715055242_add_quoted_posts.rb b/db/migrate/20140715055242_add_quoted_posts.rb index 47550f92513..d3052b8a07c 100644 --- a/db/migrate/20140715055242_add_quoted_posts.rb +++ b/db/migrate/20140715055242_add_quoted_posts.rb @@ -30,7 +30,7 @@ SQL results.each do |row| post_id, max_id = row["id"].to_i - doc = Nokogiri::HTML.fragment(row["cooked"]) + doc = Nokogiri::HTML5.fragment(row["cooked"]) uniq = {} diff --git a/lib/content_security_policy/extension.rb b/lib/content_security_policy/extension.rb index 4c8231b60a1..93eab088e41 100644 --- a/lib/content_security_policy/extension.rb +++ b/lib/content_security_policy/extension.rb @@ -61,7 +61,8 @@ class ContentSecurityPolicy auto_script_src_extension = { script_src: [] } html_fields.each(&:ensure_baked!) doc = html_fields.map(&:value_baked).join("\n") - Nokogiri::HTML.fragment(doc).css('script[src]').each do |node| + + Nokogiri::HTML5.fragment(doc).css('script[src]').each do |node| src = node['src'] uri = URI(src) diff --git a/lib/cooked_post_processor.rb b/lib/cooked_post_processor.rb index a2d75bfdf9e..725258cc10d 100644 --- a/lib/cooked_post_processor.rb +++ b/lib/cooked_post_processor.rb @@ -24,7 +24,7 @@ class CookedPostProcessor @cooking_options = @cooking_options.symbolize_keys cooked = post.cook(post.raw, @cooking_options) - @doc = Nokogiri::HTML::fragment(cooked) + @doc = Nokogiri::HTML5::fragment(cooked) @has_oneboxes = post.post_analyzer.found_oneboxes? @size_cache = {} @@ -95,7 +95,7 @@ class CookedPostProcessor return if previous.blank? - previous_text = Nokogiri::HTML::fragment(previous).text.strip + previous_text = Nokogiri::HTML5::fragment(previous).text.strip quoted_text = @doc.css("aside.quote:first-child blockquote").first&.text&.strip || "" return if previous_text.gsub(/(\s){2,}/, '\1') != quoted_text.gsub(/(\s){2,}/, '\1') diff --git a/lib/discourse_diff.rb b/lib/discourse_diff.rb index c2b31716237..cdf37870201 100644 --- a/lib/discourse_diff.rb +++ b/lib/discourse_diff.rb @@ -168,7 +168,7 @@ class DiscourseDiff end def tokenize_html_blocks(html) - Nokogiri::HTML.fragment(html).search("./*").map(&:to_html) + Nokogiri::HTML5.fragment(html).search("./*").map(&:to_html) end def tokenize_html(html) diff --git a/lib/email/receiver.rb b/lib/email/receiver.rb index 69f7ccf2704..3cb73a8976d 100644 --- a/lib/email/receiver.rb +++ b/lib/email/receiver.rb @@ -338,7 +338,7 @@ module Email markdown, elided_markdown = if html.present? # use the first html extracter that matches if html_extracter = HTML_EXTRACTERS.select { |_, r| html[r] }.min_by { |_, r| html =~ r } - doc = Nokogiri::HTML.fragment(html) + doc = Nokogiri::HTML5.fragment(html) self.public_send(:"extract_from_#{html_extracter[0]}", doc) else markdown = HtmlToMarkdown.new(html, keep_img_tags: true, keep_cid_imgs: true).to_markdown diff --git a/lib/email/styles.rb b/lib/email/styles.rb index 955398aad8c..69bfc7ec233 100644 --- a/lib/email/styles.rb +++ b/lib/email/styles.rb @@ -15,7 +15,7 @@ module Email def initialize(html, opts = nil) @html = html @opts = opts || {} - @fragment = Nokogiri::HTML.fragment(@html) + @fragment = Nokogiri::HTML5.parse(@html) @custom_styles = nil end @@ -161,7 +161,7 @@ module Email src_uri = i["data-original-href"].present? ? URI(i["data-original-href"]) : URI(i['src']) # If an iframe is protocol relative, use SSL when displaying it display_src = "#{src_uri.scheme || 'https'}://#{src_uri.host}#{src_uri.path}#{src_uri.query.nil? ? '' : '?' + src_uri.query}#{src_uri.fragment.nil? ? '' : '#' + src_uri.fragment}" - i.replace "

#{CGI.escapeHTML(display_src)}

" + i.replace(Nokogiri::HTML5.fragment("

#{CGI.escapeHTML(display_src)}

")) rescue URI::Error # If the URL is weird, remove the iframe i.remove @@ -242,7 +242,11 @@ module Email strip_classes_and_ids replace_relative_urls replace_secure_media_urls - @fragment.to_html + include_body? ? @fragment.at("body").to_html : @fragment.at("body").children.to_html + end + + def include_body? + @html =~ //i end def strip_avatars_and_emojis diff --git a/lib/onebox/engine/whitelisted_generic_onebox.rb b/lib/onebox/engine/whitelisted_generic_onebox.rb index a10f22e83b4..7a46a0d1e5d 100644 --- a/lib/onebox/engine/whitelisted_generic_onebox.rb +++ b/lib/onebox/engine/whitelisted_generic_onebox.rb @@ -24,7 +24,7 @@ module Onebox return true if WhitelistedGenericOnebox.html_providers.include?(data[:provider_name]) if data[:html]["iframe"] - fragment = Nokogiri::HTML::fragment(data[:html]) + fragment = Nokogiri::HTML5::fragment(data[:html]) if iframe = fragment.at_css("iframe") src = iframe["src"] return src.present? && SiteSetting.allowed_iframes.split("|").any? { |url| src.start_with?(url) } diff --git a/lib/oneboxer.rb b/lib/oneboxer.rb index 4e3ab68cf8d..d3625f19d8b 100644 --- a/lib/oneboxer.rb +++ b/lib/oneboxer.rb @@ -78,7 +78,7 @@ module Oneboxer # Parse URLs out of HTML, returning the document when finished. def self.each_onebox_link(string_or_doc, extra_paths: []) doc = string_or_doc - doc = Nokogiri::HTML::fragment(doc) if doc.is_a?(String) + doc = Nokogiri::HTML5::fragment(doc) if doc.is_a?(String) onebox_links = doc.css("a.#{ONEBOX_CSS_CLASS}", *extra_paths) if onebox_links.present? @@ -94,14 +94,14 @@ module Oneboxer def self.apply(string_or_doc, extra_paths: nil) doc = string_or_doc - doc = Nokogiri::HTML::fragment(doc) if doc.is_a?(String) + doc = Nokogiri::HTML5::fragment(doc) if doc.is_a?(String) changed = false each_onebox_link(doc, extra_paths: extra_paths) do |url, element| onebox, _ = yield(url, element) if onebox - parsed_onebox = Nokogiri::HTML::fragment(onebox) + parsed_onebox = Nokogiri::HTML5::fragment(onebox) next unless parsed_onebox.children.count > 0 if element&.parent&.node_name&.downcase == "p" && diff --git a/lib/post_revisor.rb b/lib/post_revisor.rb index 9da306db091..fd85696cf06 100644 --- a/lib/post_revisor.rb +++ b/lib/post_revisor.rb @@ -579,7 +579,7 @@ class PostRevisor def update_category_description return unless category = Category.find_by(topic_id: @topic.id) - doc = Nokogiri::HTML.fragment(@post.cooked) + doc = Nokogiri::HTML5.fragment(@post.cooked) doc.css("img").remove if html = doc.css("p").first&.inner_html&.strip diff --git a/lib/pretty_text.rb b/lib/pretty_text.rb index 73c18d0099c..9f3f65e3507 100644 --- a/lib/pretty_text.rb +++ b/lib/pretty_text.rb @@ -259,7 +259,7 @@ module PrettyText sanitized = markdown(working_text, options) - doc = Nokogiri::HTML.fragment(sanitized) + doc = Nokogiri::HTML5.fragment(sanitized) if !options[:omit_nofollow] && SiteSetting.add_rel_nofollow_to_user_content add_rel_nofollow_to_user_content(doc) @@ -269,7 +269,11 @@ module PrettyText add_mentions(doc, user_id: opts[:user_id]) end - doc.to_html + scrubber = Loofah::Scrubber.new do |node| + node.remove if node.name == 'script' + end + loofah_fragment = Loofah.fragment(doc.to_html) + loofah_fragment.scrub!(scrubber).to_html end def self.add_rel_nofollow_to_user_content(doc) @@ -282,7 +286,7 @@ module PrettyText doc.css("a").each do |l| href = l["href"].to_s begin - uri = URI(href) + uri = URI(URI.escape(href)) site_uri ||= URI(Discourse.base_url) if !uri.host.present? || @@ -305,7 +309,7 @@ module PrettyText def self.extract_links(html) links = [] - doc = Nokogiri::HTML.fragment(html) + doc = Nokogiri::HTML5.fragment(html) # remove href inside quotes & elided part doc.css("aside.quote a, .elided a").each { |a| a["href"] = "" } @@ -338,7 +342,7 @@ module PrettyText def self.excerpt(html, max_length, options = {}) # TODO: properly fix this HACK in ExcerptParser without introducing XSS - doc = Nokogiri::HTML.fragment(html) + doc = Nokogiri::HTML5.fragment(html) DiscourseEvent.trigger(:reduce_excerpt, doc, options) strip_image_wrapping(doc) strip_oneboxed_media(doc) @@ -350,7 +354,7 @@ module PrettyText return string if string.blank? # If the user is not basic, strip links from their bio - fragment = Nokogiri::HTML.fragment(string) + fragment = Nokogiri::HTML5.fragment(string) fragment.css('a').each { |a| a.replace(a.inner_html) } fragment.to_html end @@ -395,14 +399,14 @@ module PrettyText def self.strip_secure_media(doc) doc.css("a[href]").each do |a| if Upload.secure_media_url?(a["href"]) - target = %w(video audio).include?(a&.parent&.parent&.name) ? a.parent.parent : a + target = %w(video audio).include?(a&.parent&.name) ? a.parent : a target.replace "

#{I18n.t("emails.secure_media_placeholder")}

" end end end def self.format_for_email(html, post = nil) - doc = Nokogiri::HTML.fragment(html) + doc = Nokogiri::HTML5.fragment(html) DiscourseEvent.trigger(:reduce_cooked, doc, post) strip_secure_media(doc) if post&.with_secure_media? strip_image_wrapping(doc) @@ -462,13 +466,13 @@ module PrettyText case type when USER_TYPE - element['href'] = "#{Discourse::base_uri}/u/#{name}" + element['href'] = "#{Discourse::base_uri}/u/#{URI.escape(name)}" when GROUP_MENTIONABLE_TYPE element['class'] = 'mention-group notify' - element['href'] = "#{Discourse::base_uri}/groups/#{name}" + element['href'] = "#{Discourse::base_uri}/groups/#{URI.escape(name)}" when GROUP_TYPE element['class'] = 'mention-group' - element['href'] = "#{Discourse::base_uri}/groups/#{name}" + element['href'] = "#{Discourse::base_uri}/groups/#{URI.escape(name)}" end end end diff --git a/lib/quote_comparer.rb b/lib/quote_comparer.rb index 5da2891a7e1..74f39f84a77 100644 --- a/lib/quote_comparer.rb +++ b/lib/quote_comparer.rb @@ -18,7 +18,7 @@ class QuoteComparer def modified? return true if @text.blank? || @parent_post.blank? - parent_text = Nokogiri::HTML::fragment(@parent_post.cooked).text.delete(QuoteComparer.whitespace) + parent_text = Nokogiri::HTML5::fragment(@parent_post.cooked).text.delete(QuoteComparer.whitespace) text = @text.delete(QuoteComparer.whitespace) !parent_text.include?(text) diff --git a/lib/retrieve_title.rb b/lib/retrieve_title.rb index 3c55ec7aecf..227da9f0cbd 100644 --- a/lib/retrieve_title.rb +++ b/lib/retrieve_title.rb @@ -11,7 +11,7 @@ module RetrieveTitle def self.extract_title(html) title = nil - if doc = Nokogiri::HTML(html) + if doc = Nokogiri::HTML5(html) title = doc.at('title')&.inner_text diff --git a/lib/reviewable/conversation.rb b/lib/reviewable/conversation.rb index 53eba48ef1a..696959dd922 100644 --- a/lib/reviewable/conversation.rb +++ b/lib/reviewable/conversation.rb @@ -17,7 +17,7 @@ class Reviewable < ActiveRecord::Base def self.excerpt(cooked) excerpt = ::Post.excerpt(cooked, 250, keep_emoji_images: true) # remove the first link if it's the first node - fragment = Nokogiri::HTML.fragment(excerpt) + fragment = Nokogiri::HTML5.fragment(excerpt) if fragment.children.first == fragment.css("a:first").first && fragment.children.first fragment.children.first.remove end diff --git a/lib/tasks/emoji.rake b/lib/tasks/emoji.rake index ad05c0d1b24..ace173d55c2 100644 --- a/lib/tasks/emoji.rake +++ b/lib/tasks/emoji.rake @@ -353,7 +353,7 @@ def generate_emoji_groups(keywords, sections) puts "Generating groups..." list = open(EMOJI_ORDERING_URL).read - doc = Nokogiri::HTML(list) + doc = Nokogiri::HTML5(list) table = doc.css("table")[0] EMOJI_GROUPS.map do |group| diff --git a/plugins/discourse-details/spec/components/pretty_text_spec.rb b/plugins/discourse-details/spec/components/pretty_text_spec.rb index aa768305ecf..dd0cf14a722 100644 --- a/plugins/discourse-details/spec/components/pretty_text_spec.rb +++ b/plugins/discourse-details/spec/components/pretty_text_spec.rb @@ -8,7 +8,7 @@ describe PrettyText do let(:post) { Fabricate(:post) } it "supports details tag" do - cooked_html = <<~HTML + cooked_html = <<~HTML.gsub("\n", "")
foo @@ -17,7 +17,7 @@ describe PrettyText do HTML expect(cooked_html).to match_html(cooked_html) - expect(PrettyText.cook("[details=foo]\nbar\n[/details]")).to match_html(cooked_html) + expect(PrettyText.cook("[details=foo]\nbar\n[/details]").gsub("\n", "")).to match_html(cooked_html) end it "deletes elided content" do diff --git a/plugins/discourse-narrative-bot/lib/discourse_narrative_bot/actions.rb b/plugins/discourse-narrative-bot/lib/discourse_narrative_bot/actions.rb index 342e1491284..56afacf2af7 100644 --- a/plugins/discourse-narrative-bot/lib/discourse_narrative_bot/actions.rb +++ b/plugins/discourse-narrative-bot/lib/discourse_narrative_bot/actions.rb @@ -68,7 +68,7 @@ module DiscourseNarrativeBot end def bot_mentioned?(post) - doc = Nokogiri::HTML.fragment(post.cooked) + doc = Nokogiri::HTML5.fragment(post.cooked) valid = false diff --git a/plugins/discourse-narrative-bot/lib/discourse_narrative_bot/advanced_user_narrative.rb b/plugins/discourse-narrative-bot/lib/discourse_narrative_bot/advanced_user_narrative.rb index aa66cb1802a..52f1cdab65a 100644 --- a/plugins/discourse-narrative-bot/lib/discourse_narrative_bot/advanced_user_narrative.rb +++ b/plugins/discourse-narrative-bot/lib/discourse_narrative_bot/advanced_user_narrative.rb @@ -280,7 +280,7 @@ module DiscourseNarrativeBot topic_id = @post.topic_id return unless valid_topic?(topic_id) - if Nokogiri::HTML.fragment(@post.cooked).css('.hashtag').size > 0 + if Nokogiri::HTML5.fragment(@post.cooked).css('.hashtag').size > 0 raw = <<~RAW #{I18n.t("#{I18N_KEY}.category_hashtag.reply", i18n_post_args)} @@ -331,7 +331,7 @@ module DiscourseNarrativeBot topic_id = @post.topic_id return unless valid_topic?(topic_id) - if Nokogiri::HTML.fragment(@post.cooked).css(".poll").size > 0 + if Nokogiri::HTML5.fragment(@post.cooked).css(".poll").size > 0 raw = <<~RAW #{I18n.t("#{I18N_KEY}.poll.reply", i18n_post_args)} @@ -354,7 +354,7 @@ module DiscourseNarrativeBot fake_delay - if Nokogiri::HTML.fragment(@post.cooked).css("details").size > 0 + if Nokogiri::HTML5.fragment(@post.cooked).css("details").size > 0 reply_to(@post, I18n.t("#{I18N_KEY}.details.reply", i18n_post_args)) else reply_to(@post, I18n.t("#{I18N_KEY}.details.not_found", i18n_post_args)) unless @data[:attempted] diff --git a/plugins/discourse-narrative-bot/lib/discourse_narrative_bot/new_user_narrative.rb b/plugins/discourse-narrative-bot/lib/discourse_narrative_bot/new_user_narrative.rb index 5e9a1f38a54..9898ba085e8 100644 --- a/plugins/discourse-narrative-bot/lib/discourse_narrative_bot/new_user_narrative.rb +++ b/plugins/discourse-narrative-bot/lib/discourse_narrative_bot/new_user_narrative.rb @@ -326,7 +326,7 @@ module DiscourseNarrativeBot cooked = @post.post_analyzer.cook(@post.raw, {}) - if Nokogiri::HTML.fragment(cooked).css("img").size > 0 + if Nokogiri::HTML5.fragment(cooked).css("img").size > 0 set_state_data(:post_id, @post.id) if get_state_data(:liked) @@ -366,7 +366,7 @@ module DiscourseNarrativeBot post_topic_id = @post.topic_id return unless valid_topic?(post_topic_id) - if Nokogiri::HTML.fragment(@post.cooked).css("b", "strong", "em", "i", ".bbcode-i", ".bbcode-b").size > 0 + if Nokogiri::HTML5.fragment(@post.cooked).css("b", "strong", "em", "i", ".bbcode-i", ".bbcode-b").size > 0 raw = <<~RAW #{I18n.t("#{I18N_KEY}.formatting.reply", i18n_post_args)} @@ -390,7 +390,7 @@ module DiscourseNarrativeBot post_topic_id = @post.topic_id return unless valid_topic?(post_topic_id) - doc = Nokogiri::HTML.fragment(@post.cooked) + doc = Nokogiri::HTML5.fragment(@post.cooked) if doc.css(".quote").size > 0 raw = <<~RAW @@ -416,7 +416,7 @@ module DiscourseNarrativeBot post_topic_id = @post.topic_id return unless valid_topic?(post_topic_id) - doc = Nokogiri::HTML.fragment(@post.cooked) + doc = Nokogiri::HTML5.fragment(@post.cooked) if doc.css(".emoji").size > 0 raw = <<~RAW diff --git a/plugins/poll/plugin.rb b/plugins/poll/plugin.rb index e0c1057e36d..78fa0ba548a 100644 --- a/plugins/poll/plugin.rb +++ b/plugins/poll/plugin.rb @@ -350,7 +350,7 @@ after_initialize do # in the validators instead of cooking twice cooked = PrettyText.cook(raw, topic_id: topic_id, user_id: user_id) - Nokogiri::HTML(cooked).css("div.poll").map do |p| + Nokogiri::HTML5(cooked).css("div.poll").map do |p| poll = { "options" => [], "name" => DiscoursePoll::DEFAULT_POLL_NAME } # attributes diff --git a/plugins/poll/spec/lib/pretty_text_spec.rb b/plugins/poll/spec/lib/pretty_text_spec.rb index db8af253a8e..c4d09d2aacd 100644 --- a/plugins/poll/spec/lib/pretty_text_spec.rb +++ b/plugins/poll/spec/lib/pretty_text_spec.rb @@ -131,7 +131,7 @@ describe PrettyText do MD onebox = Oneboxer.onebox_raw(post.full_url, user_id: Fabricate(:user).id) - doc = Nokogiri::HTML(onebox[:preview]) + doc = Nokogiri::HTML5(onebox[:preview]) expect(onebox[:preview]).to include("A post with a poll") expect(onebox[:preview]).to include("poll") diff --git a/script/import_scripts/ipboard3.rb b/script/import_scripts/ipboard3.rb index 0791e2b3a00..4e7638633df 100644 --- a/script/import_scripts/ipboard3.rb +++ b/script/import_scripts/ipboard3.rb @@ -376,7 +376,7 @@ class ImportScripts::IPBoard3 < ImportScripts::Base raw.gsub!(/<(.+)> <\/\1>/, "\n\n") - doc = Nokogiri::HTML.fragment(raw) + doc = Nokogiri::HTML5.fragment(raw) doc.css("blockquote.ipsBlockquote").each do |bq| post_id = post_id_from_imported_post_id(bq["data-cid"]) diff --git a/script/import_scripts/jive.rb b/script/import_scripts/jive.rb index 8d386a52c58..f380b2bfee2 100644 --- a/script/import_scripts/jive.rb +++ b/script/import_scripts/jive.rb @@ -218,7 +218,7 @@ class ImportScripts::Jive < ImportScripts::Base raw = raw.dup raw = raw[5..-6] - doc = Nokogiri::HTML.fragment(raw) + doc = Nokogiri::HTML5.fragment(raw) doc.css('img').each do |img| img.remove if img['class'] == "jive-image" end diff --git a/script/import_scripts/jive_api.rb b/script/import_scripts/jive_api.rb index cf6df4d5bef..cee4928227d 100644 --- a/script/import_scripts/jive_api.rb +++ b/script/import_scripts/jive_api.rb @@ -297,7 +297,7 @@ class ImportScripts::JiveApi < ImportScripts::Base end def process_raw(raw) - doc = Nokogiri::HTML.fragment(raw) + doc = Nokogiri::HTML5.fragment(raw) # convert emoticon doc.css("span.emoticon-inline").each do |span| diff --git a/script/import_scripts/lithium.rb b/script/import_scripts/lithium.rb index ac18a3fa36f..161618a7c16 100644 --- a/script/import_scripts/lithium.rb +++ b/script/import_scripts/lithium.rb @@ -913,7 +913,7 @@ SQL raw.sub!(match, content) end - doc = Nokogiri::HTML.fragment(raw) + doc = Nokogiri::HTML5.fragment(raw) doc.css("a,img,li-image").each do |l| upload_name, image, linked_upload = [nil] * 3 diff --git a/spec/components/cooked_post_processor_spec.rb b/spec/components/cooked_post_processor_spec.rb index 47262f1342e..51e9eadf69f 100644 --- a/spec/components/cooked_post_processor_spec.rb +++ b/spec/components/cooked_post_processor_spec.rb @@ -453,10 +453,8 @@ describe CookedPostProcessor do it "generates overlay information" do cpp.post_process - expect(cpp.html).to match_html <<~HTML -

+ expect(cpp.html).to match_html <<~HTML.rstrip +

HTML expect(cpp).to be_dirty @@ -475,7 +473,7 @@ describe CookedPostProcessor do cpp.post_process - expect(cpp.html).to match_html <<~HTML + expect(cpp.html).to match_html <<~HTML.rstrip

HTML end @@ -491,7 +489,7 @@ describe CookedPostProcessor do cpp.post_process - expect(cpp.html).to match_html <<~HTML + expect(cpp.html).to match_html <<~HTML.rstrip

HTML end @@ -619,10 +617,8 @@ describe CookedPostProcessor do it "crops the image" do cpp.post_process - expect(cpp.html).to match_html <<~HTML -

+ expect(cpp.html).to match_html <<~HTML.rstrip +

HTML expect(cpp).to be_dirty @@ -652,10 +648,8 @@ describe CookedPostProcessor do it "generates overlay information" do cpp.post_process - expect(cpp.html). to match_html <<~HTML -

+ expect(cpp.html). to match_html <<~HTML.rstrip +

HTML expect(cpp).to be_dirty @@ -665,10 +659,8 @@ describe CookedPostProcessor do upload.update!(original_filename: ">.png") cpp.post_process - expect(cpp.html).to match_html <<~HTML -

+ expect(cpp.html).to match_html <<~HTML.rstrip +

HTML end @@ -693,10 +685,8 @@ describe CookedPostProcessor do it "generates overlay information using image title and ignores alt" do cpp.post_process - expect(cpp.html).to match_html <<~HTML -

+ expect(cpp.html).to match_html <<~HTML.rstrip +

HTML expect(cpp).to be_dirty @@ -723,10 +713,8 @@ describe CookedPostProcessor do it "generates overlay information using image title" do cpp.post_process - expect(cpp.html).to match_html <<~HTML -

+ expect(cpp.html).to match_html <<~HTML.rstrip +

HTML expect(cpp).to be_dirty @@ -753,10 +741,8 @@ describe CookedPostProcessor do it "generates overlay information using image alt" do cpp.post_process - expect(cpp.html).to match_html <<~HTML -

+ expect(cpp.html).to match_html <<~HTML.rstrip +

HTML expect(cpp).to be_dirty @@ -993,7 +979,7 @@ describe CookedPostProcessor do cpp = CookedPostProcessor.new(post, disable_loading_image: true) cpp.post_process - doc = Nokogiri::HTML::fragment(cpp.html) + doc = Nokogiri::HTML5::fragment(cpp.html) expect(doc.css('.lightbox-wrapper').size).to eq(1) expect(doc.css('img').first['srcset']).to_not eq(nil) end @@ -1008,7 +994,7 @@ describe CookedPostProcessor do cpp = CookedPostProcessor.new(post, disable_loading_image: true) cpp.post_process - doc = Nokogiri::HTML::fragment(cpp.html) + doc = Nokogiri::HTML5::fragment(cpp.html) expect(doc.css('.lightbox-wrapper').size).to eq(0) expect(doc.css('img').first['srcset']).to_not eq(nil) end @@ -1023,7 +1009,7 @@ describe CookedPostProcessor do cpp = CookedPostProcessor.new(post, disable_loading_image: true) cpp.post_process - doc = Nokogiri::HTML::fragment(cpp.html) + doc = Nokogiri::HTML5::fragment(cpp.html) expect(doc.css('.lightbox-wrapper').size).to eq(0) expect(doc.css('img').first['srcset']).to_not eq(nil) end @@ -1227,7 +1213,7 @@ describe CookedPostProcessor do it "uses schemaless url for uploads" do cpp.optimize_urls - expect(cpp.html).to match_html <<~HTML + expect(cpp.html).to match_html <<~HTML.rstrip

Link

Google
@@ -1242,7 +1228,7 @@ describe CookedPostProcessor do it "uses schemaless CDN url for http uploads" do Rails.configuration.action_controller.stubs(:asset_host).returns("http://my.cdn.com") cpp.optimize_urls - expect(cpp.html).to match_html <<~HTML + expect(cpp.html).to match_html <<~HTML.rstrip

Link

Google
@@ -1255,7 +1241,7 @@ describe CookedPostProcessor do it "doesn't use schemaless CDN url for https uploads" do Rails.configuration.action_controller.stubs(:asset_host).returns("https://my.cdn.com") cpp.optimize_urls - expect(cpp.html).to match_html <<~HTML + expect(cpp.html).to match_html <<~HTML.rstrip

Link

Google
@@ -1269,7 +1255,7 @@ describe CookedPostProcessor do SiteSetting.login_required = true Rails.configuration.action_controller.stubs(:asset_host).returns("http://my.cdn.com") cpp.optimize_urls - expect(cpp.html).to match_html <<~HTML + expect(cpp.html).to match_html <<~HTML.rstrip

Link

Google
@@ -1283,7 +1269,7 @@ describe CookedPostProcessor do SiteSetting.prevent_anons_from_downloading_files = true Rails.configuration.action_controller.stubs(:asset_host).returns("http://my.cdn.com") cpp.optimize_urls - expect(cpp.html).to match_html <<~HTML + expect(cpp.html).to match_html <<~HTML.rstrip

Link

Google
@@ -1318,7 +1304,7 @@ describe CookedPostProcessor do cpp = CookedPostProcessor.new(the_post) cpp.optimize_urls - expect(cpp.html).to match_html <<~HTML + expect(cpp.html).to match_html <<~HTML.rstrip

This post has a local emoji :+1: and an external upload

smallest.png

HTML @@ -1336,7 +1322,7 @@ describe CookedPostProcessor do cpp = CookedPostProcessor.new(the_post) cpp.optimize_urls - expect(cpp.html).to match_html <<~HTML + expect(cpp.html).to match_html <<~HTML.rstrip

This post has a local emoji :+1: and an external upload

smallest.png

HTML @@ -1357,18 +1343,20 @@ describe CookedPostProcessor do the_post = Fabricate(:post, raw: "This post has an S3 video onebox:\n#{video_upload.url}") - cpp = CookedPostProcessor.new(the_post) + cpp = CookedPostProcessor.new(the_post.reload) + cpp.post_process_oneboxes + + cpp = CookedPostProcessor.new(the_post.reload) cpp.post_process_oneboxes expect(cpp.html).to match_html <<~HTML -

This post has an S3 video onebox:

-
- -
+

This post has an S3 video onebox:
+

HTML end @@ -1384,13 +1372,12 @@ describe CookedPostProcessor do secure_url = video_upload.url.sub(SiteSetting.s3_cdn_url, "#{Discourse.base_url}/secure-media-uploads") - expect(cpp.html).to match_html <<~HTML + expect(cpp.html).to match_html <<~HTML.rstrip

This post has an S3 video onebox:

@@ -1416,7 +1403,7 @@ describe CookedPostProcessor do stub_request(:head, audio_upload.url) stub_request(:get, image_upload.url) - raw = <<~RAW + raw = <<~RAW.rstrip This post has a video upload. #{video_upload.url} @@ -1435,19 +1422,17 @@ describe CookedPostProcessor do secure_video_url = video_upload.url.sub(SiteSetting.s3_cdn_url, "#{Discourse.base_url}/secure-media-uploads") secure_audio_url = audio_upload.url.sub(SiteSetting.s3_cdn_url, "#{Discourse.base_url}/secure-media-uploads") - expect(cpp.html).to match_html <<~HTML -

This post has a video upload.

+ expect(cpp.html).to match_html <<~HTML.rstrip +

This post has a video upload.

- -

This post has an audio upload.
-

+

This post has an audio upload.
+

And an image upload.
#{image_upload.original_filename}

@@ -1616,7 +1601,7 @@ describe CookedPostProcessor do let(:post) { build(:post) } let(:cpp) { CookedPostProcessor.new(post) } - let(:doc) { Nokogiri::HTML::fragment('

') } + let(:doc) { Nokogiri::HTML5::fragment('

') } it "is true when the image is inside a link" do img = doc.css("img#linked_image").first diff --git a/spec/components/email/styles_spec.rb b/spec/components/email/styles_spec.rb index bdcd27aa066..74addfe29dd 100644 --- a/spec/components/email/styles_spec.rb +++ b/spec/components/email/styles_spec.rb @@ -8,14 +8,14 @@ describe Email::Styles do def basic_fragment(html) styler = Email::Styles.new(html) styler.format_basic - Nokogiri::HTML.fragment(styler.to_html) + Nokogiri::HTML5.fragment(styler.to_html) end def html_fragment(html) styler = Email::Styles.new(html) styler.format_basic styler.format_html - Nokogiri::HTML.fragment(styler.to_html) + Nokogiri::HTML5.fragment(styler.to_html) end context "basic formatter" do diff --git a/spec/components/excerpt_parser_spec.rb b/spec/components/excerpt_parser_spec.rb index 8f0654bab5f..363dd049e0a 100644 --- a/spec/components/excerpt_parser_spec.rb +++ b/spec/components/excerpt_parser_spec.rb @@ -18,7 +18,7 @@ describe ExcerptParser do
HTML - expect(ExcerptParser.get_excerpt(html, 50, {})).to match_html(<<~HTML) + expect(ExcerptParser.get_excerpt(html, 50, {})).to match_html(<<~HTML.rstrip)
FOOBAR Lorem ipsum dolor sit amet, consectetur adi…
HTML diff --git a/spec/components/pretty_text_spec.rb b/spec/components/pretty_text_spec.rb index 784b607972a..8969a526bd5 100644 --- a/spec/components/pretty_text_spec.rb +++ b/spec/components/pretty_text_spec.rb @@ -184,7 +184,7 @@ describe PrettyText do