2019-05-02 18:17:27 -04:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
2013-05-27 19:48:47 -04:00
|
|
|
class ExcerptParser < Nokogiri::XML::SAX::Document
|
|
|
|
attr_reader :excerpt
|
|
|
|
|
2021-05-24 05:05:24 -04:00
|
|
|
# Matches the opening tag of a <span> or <div> whose class attribute is exactly
# "excerpt" (single- or double-quoted). get_excerpt uses it to detect markup
# that carries a custom excerpt element.
CUSTOM_EXCERPT_REGEX = /<\s*(span|div)[^>]*class\s*=\s*['"]excerpt['"][^>]*>/
|
2014-09-04 01:03:12 -04:00
|
|
|
|
2013-06-03 16:12:24 -04:00
|
|
|
# Sets up the handler state for one excerpt extraction.
#
# @param length [Integer] maximum number of counted characters to keep
# @param options [Hash, nil] feature flags keyed by symbol. A flag is enabled
#   only when its value is exactly +true+; nil is treated as "no flags".
def initialize(length, options = nil)
  @length = length
  @excerpt = +""
  @current_length = 0

  # The original `options || {}` threw its result away, so the documented
  # default of nil crashed on the first lookup below.
  options ||= {}

  @strip_links = options[:strip_links] == true
  @strip_images = options[:strip_images] == true
  @text_entities = options[:text_entities] == true
  @markdown_images = options[:markdown_images] == true
  @keep_newlines = options[:keep_newlines] == true
  @keep_emoji_images = options[:keep_emoji_images] == true
  @keep_onebox_source = options[:keep_onebox_source] == true
  @keep_onebox_body = options[:keep_onebox_body] == true
  @keep_quotes = options[:keep_quotes] == true
  @keep_svg = options[:keep_svg] == true
  @remap_emoji = options[:remap_emoji] == true

  # Parsing state toggled by start_element / end_element. The four @in_* flags
  # used to spring into existence as nil; they are only ever tested for
  # truthiness, so starting them at false is equivalent but explicit.
  @start_excerpt = false
  @start_hashtag_icon = false
  @in_details_depth = 0
  @in_a = false
  @in_quote = false
  @in_summary = false
  @in_svg = false
end
|
|
|
|
|
|
|
|
# Parses +html+ with a fresh handler and returns the extracted excerpt.
#
# @param html [String, nil] markup to excerpt; nil is treated as ""
# @param length [Integer] maximum number of counted characters
# @param options [Hash, nil] same flags as #initialize; nil means no flags
# @return [String] the stripped excerpt
def self.get_excerpt(html, length, options)
  html ||= ""
  # The constructor accepts nil options, but the lookups below did not.
  options ||= {}

  # When the markup carries a custom excerpt element, raise the limit to the
  # size of the whole document. The cheap include? check avoids running the
  # regex on most inputs.
  length = html.length if html.include?("excerpt") && CUSTOM_EXCERPT_REGEX.match?(html)

  handler = new(length, options)
  # The handler throws :done as soon as the excerpt is complete.
  catch(:done) { Nokogiri::HTML::SAX::Parser.new(handler).parse(html) }

  excerpt = handler.excerpt.strip

  if options[:keep_onebox_source] || options[:keep_onebox_body]
    # Collapse each run of newlines (and surrounding whitespace) to one blank line.
    excerpt = excerpt.gsub(/\s*\n+\s*/, "\n\n")
  end

  excerpt = CGI.unescapeHTML(excerpt) if options[:text_entities] == true
  excerpt
end
|
|
|
|
|
2014-07-24 22:15:43 -04:00
|
|
|
# Escapes a value so it can be placed inside a double-quoted HTML attribute.
#
# The ampersand, double quote and angle brackets are replaced by their
# entities in a single pass, so entities produced here are never re-escaped.
# The argument itself is not modified.
#
# @param v [String, nil] raw attribute value
# @return [String] escaped copy, or "" when +v+ is nil or false
def escape_attribute(v)
  return "" unless v

  entities = { "&" => "&amp;", "\"" => "&quot;", "<" => "&lt;", ">" => "&gt;" }
  v.to_s.gsub(/[&"<>]/, entities)
end
|
|
|
|
|
2013-06-05 18:54:46 -04:00
|
|
|
# Appends a raw opening tag for +name+ to the excerpt.
#
# The tag is passed to #characters with truncation, counting and encoding all
# switched off, so it is written verbatim and does not use up the length
# budget. Attribute values go through #escape_attribute.
#
# @param name [String] tag name
# @param attributes [Hash, Array] attribute name/value pairs; callers in
#   start_element pass either the raw pair list or a Hash built from it
def include_tag(name, attributes)
  rendered = attributes.map { |key, value| %(#{key}="#{escape_attribute(value)}") }.join(" ")

  characters("<#{name} #{rendered}>", truncate: false, count_it: false, encode: false)
end
|
|
|
|
|
2013-05-27 19:48:47 -04:00
|
|
|
# SAX callback for an opening tag. Depending on the tag and the configured
# flags it either writes something to the excerpt (via #characters or
# #include_tag) or updates the parsing state used by #characters.
#
# @param name [String] tag name
# @param attributes [Array] attribute [name, value] pairs
def start_element(name, attributes = [])
  case name
  when "img"
    attributes = attributes.to_h

    if attributes["class"]&.include?("emoji")
      if @remap_emoji
        # Look the emoji up by its alt text without colons; fall back to the
        # alt text when there is no unicode mapping.
        title = (attributes["alt"] || "").gsub(":", "")
        title = Emoji.lookup_unicode(title) || attributes["alt"]
        return characters(title)
      elsif @keep_emoji_images
        return include_tag(name, attributes)
      else
        return characters(attributes["alt"])
      end
    end

    unless @strip_images
      # With markdown_images the image is rendered as ![label](src),
      # otherwise only as [label].
      characters("!") if @markdown_images

      if attributes["alt"].present?
        characters("[#{attributes["alt"]}]")
      elsif attributes["title"].present?
        characters("[#{attributes["title"]}]")
      else
        characters("[#{I18n.t "excerpt_image"}]")
      end

      characters("(#{attributes["src"]})") if @markdown_images
    end
  when "a"
    unless @strip_links
      include_tag(name, attributes)
      @in_a = true
    end
  when "aside"
    attributes = attributes.to_h
    css_class = attributes["class"]

    # Asides are skipped unless they are oneboxes and some onebox content was
    # asked for.
    @in_quote = true if !(@keep_onebox_source || @keep_onebox_body) || !css_class&.include?("onebox")

    if css_class&.include?("quote")
      if @keep_quotes || (@keep_onebox_body && attributes["data-topic"].present?)
        @in_quote = false
      end
    end
  when "article"
    @in_quote = !@keep_onebox_body if attributes.include?(%w[class onebox-body])
  when "header"
    @in_quote = !@keep_onebox_source if attributes.include?(%w[class source])
  when "div", "span"
    attributes = attributes.to_h
    css_class = attributes["class"]

    # A class containing "excerpt" starts a custom excerpt, unless the class
    # also contains "excerpt hidden". That exclusion keeps internal links with
    # GitHub oneboxes from being empty: https://meta.discourse.org/t/269436
    if css_class&.include?("excerpt") && !css_class&.match?("excerpt hidden")
      # Discard everything collected so far; the excerpt restarts here.
      @excerpt = +""
      @current_length = 0
      @start_excerpt = true
    elsif css_class&.include?("hashtag-icon-placeholder")
      @start_hashtag_icon = true
      include_tag(name, attributes)
    end
  when "details"
    @in_details_depth += 1
  when "summary"
    # Only the first summary of an outermost <details> is shown.
    if @in_details_depth == 1 && !@in_summary
      @in_summary = true
      characters("▶ ", truncate: false, count_it: false, encode: false)
    end
  when "svg"
    attributes = attributes.to_h

    if attributes["class"]&.include?("d-icon") && @keep_svg
      include_tag(name, attributes)
      @in_svg = true
    end
  when "use"
    include_tag(name, attributes) if @in_svg && @keep_svg
  end
end
|
|
|
|
|
|
|
|
# SAX callback for a closing tag. Writes the closing markup for tags that
# #start_element opened and unwinds the matching parsing state.
#
# @param name [String] tag name
def end_element(name)
  case name
  when "a"
    unless @strip_links
      characters("</a>", truncate: false, count_it: false, encode: false)
      @in_a = false
    end
  when "p", "br"
    if @keep_newlines
      characters("<br>", truncate: false, count_it: false, encode: false)
    else
      characters(" ")
    end
  when "aside"
    @in_quote = false
  when "details"
    @in_details_depth -= 1
  when "summary"
    @in_summary = false if @in_details_depth == 1
  when "div", "span"
    # The element that started a custom excerpt has closed: parsing is over.
    throw :done if @start_excerpt

    if @start_hashtag_icon
      characters("</span>", truncate: false, count_it: false, encode: false)
      # Close the placeholder exactly once. The flag used to stay set, so every
      # later closing div/span also emitted a stray </span>.
      @start_hashtag_icon = false
    end
  when "svg"
    # Mirror start_element: only close an <svg> that was actually opened.
    characters("</svg>", truncate: false, count_it: false, encode: false) if @in_svg && @keep_svg
    @in_svg = false
  when "use"
    # Mirror start_element: <use> is only written inside a kept <svg>.
    characters("</use>", truncate: false, count_it: false, encode: false) if @in_svg && @keep_svg
  end
end
|
|
|
|
|
2017-12-20 15:44:36 -05:00
|
|
|
# Strips surrounding whitespace from +str+ and HTML-escapes the result.
#
# @param str [String, nil] raw text; nil is treated as ""
# @return [String] escaped text
def clean(str)
  ERB::Util.html_escape(str.to_s.strip)
end
|
|
|
|
|
2018-03-12 11:52:06 -04:00
|
|
|
# Appends text to the excerpt, enforcing the length budget. It is called by
# the SAX parser for text nodes and by this class for generated markup.
#
# Nothing is written while inside a skipped quote, inside nested <details>, or
# inside a <details> outside its summary. When the text would exceed the
# budget, a truncated piece and an ellipsis are written and :done is thrown.
#
# @param string [#to_s] text to append
# @param truncate [Boolean] on overflow, write the part that still fits
# @param count_it [Boolean] whether the text counts towards the budget
# @param encode [Boolean] HTML-escape the text before appending
# @param before_string [String, nil] raw text written before +string+
# @param after_string [String, nil] raw text written after +string+
def characters(
  string,
  truncate: true,
  count_it: true,
  encode: true,
  before_string: nil,
  after_string: nil
)
  return if @in_quote || @in_details_depth > 1 || (@in_details_depth == 1 && !@in_summary)

  # length is called on this below, so make sure it is a string
  string = string.to_s

  @excerpt << before_string if before_string

  render = ->(text) { encode ? ERB::Util.html_escape(text) : text }

  if count_it && @current_length + string.length > @length
    # NOTE(review): when the budget is already used up this still keeps one
    # character (index 0). Left as is; confirm whether that is intended.
    last_index = [0, @length - @current_length - 1].max

    # An emoji shortcode is dropped whole rather than cut in the middle.
    @excerpt << render.call(string[0..last_index]) if truncate && !emoji?(string)
    # NOTE(review): the non-text_entities ellipsis is a literal character in
    # this copy of the file; it may have been an HTML entity upstream. Confirm.
    @excerpt << (@text_entities ? "..." : "…")
    @excerpt << "</a>" if @in_a
    @excerpt << after_string if after_string
    throw :done
  end

  @excerpt << render.call(string)
  @excerpt << after_string if after_string
  @current_length += string.length if count_it
end
|
2021-01-10 18:40:41 -05:00
|
|
|
|
|
|
|
# Tells whether +string+ is a single emoji shortcode: a run of word
# characters wrapped in colons, such as ":smile:".
#
# @param string [String]
# @return [Boolean]
def emoji?(string)
  string.match?(/\A:\w+:\Z/)
end
|
2013-05-27 19:48:47 -04:00
|
|
|
end
|