diff --git a/lib/onebox/engine/standard_embed.rb b/lib/onebox/engine/standard_embed.rb index 79997ffb818..29e6caf48cd 100644 --- a/lib/onebox/engine/standard_embed.rb +++ b/lib/onebox/engine/standard_embed.rb @@ -1,8 +1,10 @@ # frozen_string_literal: true -require "cgi" -require "onebox/open_graph" +require 'cgi' +require 'onebox/normalizer' +require 'onebox/open_graph' require 'onebox/oembed' +require 'onebox/json_ld' module Onebox module Engine @@ -38,32 +40,14 @@ module Onebox def raw return @raw if defined?(@raw) - og = get_opengraph - twitter = get_twitter - oembed = get_oembed - @raw = {} - og.data.each do |k, v| - next if k == "title_attr" - v = og.send(k) - @raw[k] ||= v unless v.nil? - end - - twitter.each { |k, v| @raw[k] ||= v unless Onebox::Helpers::blank?(v) } - - oembed.data.each do |k, v| - v = oembed.send(k) - @raw[k] ||= v unless v.nil? - end - - favicon = get_favicon - @raw[:favicon] = favicon unless Onebox::Helpers::blank?(favicon) - - unless @raw[:description] - description = get_description - @raw[:description] = description unless Onebox::Helpers::blank?(description) - end + set_opengraph_data_on_raw + set_twitter_data_on_raw + set_oembed_data_on_raw + set_json_ld_data_on_raw + set_favicon_data_on_raw + set_description_on_raw @raw end @@ -154,6 +138,50 @@ module Onebox oembed_url end + + def get_json_ld + @json_ld ||= Onebox::JsonLd.new(html_doc) + end + + def set_from_normalizer_data(normalizer) + normalizer.data.each do |k, v| + v = normalizer.send(k) + @raw[k] ||= v unless v.nil? + end + end + + def set_opengraph_data_on_raw + og = get_opengraph + set_from_normalizer_data(og) + @raw.except!(:title_attr) + end + + def set_twitter_data_on_raw + twitter = get_twitter + twitter.each { |k, v| @raw[k] ||= v unless Onebox::Helpers::blank?(v) } + end + + def set_oembed_data_on_raw + oembed = get_oembed + set_from_normalizer_data(oembed) + end + + def set_json_ld_data_on_raw + json_ld = get_json_ld + set_from_normalizer_data(json_ld) + end + + def set_favicon_data_on_raw + favicon = get_favicon + @raw[:favicon] = favicon unless Onebox::Helpers::blank?(favicon) + end + + def set_description_on_raw + unless @raw[:description] + description = get_description + @raw[:description] = description unless Onebox::Helpers::blank?(description) + end + end end end end diff --git a/lib/onebox/json_ld.rb b/lib/onebox/json_ld.rb new file mode 100644 index 00000000000..c4f426ac3d4 --- /dev/null +++ b/lib/onebox/json_ld.rb @@ -0,0 +1,45 @@ +# frozen_string_literal: true + +module Onebox + class JsonLd < Normalizer + # Full schema.org hierarchy can be found here: https://schema.org/docs/full.html + MOVIE_JSON_LD_TYPE = "Movie" + + def initialize(doc) + @data = extract(doc) + end + + private + + def extract(doc) + extracted_json = extract_json_from(doc) + parsed_json = parse_json(extracted_json) + + extracted = + case parsed_json["@type"] + when MOVIE_JSON_LD_TYPE + Onebox::Movie.new(parsed_json) + else + {} + end + + extracted.to_h + end + + def extract_json_from(doc) + return {} if Onebox::Helpers::blank?(doc) + json_ld = doc.search('script[type="application/ld+json"]').text + return {} if Onebox::Helpers::blank?(json_ld) + json_ld + end + + def parse_json(json) + begin + JSON[json] + rescue JSON::ParserError => e + Discourse.warn_exception(e, message: "Error parsing JSON-LD json: #{json}") + {} + end + end + end +end diff --git a/lib/onebox/movie.rb b/lib/onebox/movie.rb new file mode 100644 index 00000000000..645005615bd --- /dev/null +++ b/lib/onebox/movie.rb @@ -0,0 +1,46 @@ +# frozen_string_literal: true + +module Onebox + class Movie + def initialize(json_ld_data) + @json_ld_data = json_ld_data + end + + def name + @json_ld_data['name'] + end + + def image + @json_ld_data['image'] + end + + def description + @json_ld_data['description'] + end + + def rating + @json_ld_data.dig('aggregateRating', 'ratingValue') + end + + def genres + @json_ld_data['genre'] + end + + def duration + return nil unless @json_ld_data['duration'] + + Time.parse(@json_ld_data['duration']).strftime '%H:%M' + end + + def to_h + { + name: name, + image: image, + description: description, + rating: rating, + genres: genres, + duration: duration + } + end + end +end diff --git a/lib/onebox/normalizer.rb b/lib/onebox/normalizer.rb new file mode 100644 index 00000000000..3eafb635c0b --- /dev/null +++ b/lib/onebox/normalizer.rb @@ -0,0 +1,52 @@ +# frozen_string_literal: true + +module Onebox + class Normalizer + attr_reader :data + + def get(attr, length = nil, sanitize = true) + return nil if Onebox::Helpers::blank?(data) + + value = data[attr] + + return nil if Onebox::Helpers::blank?(value) + + value = html_entities.decode(value) + value = Sanitize.fragment(value) if sanitize + value.strip! + value = Onebox::Helpers.truncate(value, length) unless length.nil? + + value + end + + def method_missing(attr, *args, &block) + value = get(attr, *args) + + return nil if Onebox::Helpers::blank?(value) + + method_name = attr.to_s + if method_name.end_with?(*integer_suffixes) + value.to_i + elsif method_name.end_with?(*url_suffixes) + result = Onebox::Helpers.normalize_url_for_output(value) + result unless Onebox::Helpers::blank?(result) + else + value + end + end + + private + + def integer_suffixes + ['width', 'height'] + end + + def url_suffixes + ['url', 'image', 'video'] + end + + def html_entities + @html_entities ||= HTMLEntities.new + end + end +end diff --git a/lib/onebox/oembed.rb b/lib/onebox/oembed.rb index 3068a161e23..d5c2bd20aa4 100644 --- a/lib/onebox/oembed.rb +++ b/lib/onebox/oembed.rb @@ -2,12 +2,11 @@ module Onebox class Oembed < OpenGraph - def initialize(response) @data = Onebox::Helpers.symbolize_keys(::MultiJson.load(response)) # never use oembed from WordPress 4.4 (it's broken) - data.delete(:html) if data[:html] && data[:html]["wp-embedded-content"] + @data.delete(:html) if @data[:html] && @data[:html]["wp-embedded-content"] end def html diff --git a/lib/onebox/open_graph.rb b/lib/onebox/open_graph.rb index f78e3d088da..5aae110fb04 100644 --- a/lib/onebox/open_graph.rb +++ b/lib/onebox/open_graph.rb @@ -1,9 +1,7 @@ # frozen_string_literal: true module Onebox - class OpenGraph - - attr_reader :data + class OpenGraph < Normalizer def initialize(doc) @data = extract(doc) @@ -23,51 +21,8 @@ module Onebox secure_url.to_s end - def method_missing(attr, *args, &block) - value = get(attr, *args) - - return nil if Onebox::Helpers::blank?(value) - - method_name = attr.to_s - if method_name.end_with?(*integer_suffixes) - value.to_i - elsif method_name.end_with?(*url_suffixes) - result = Onebox::Helpers.normalize_url_for_output(value) - result unless Onebox::Helpers::blank?(result) - else - value - end - end - - def get(attr, length = nil, sanitize = true) - return nil if Onebox::Helpers::blank?(data) - - value = data[attr] - - return nil if Onebox::Helpers::blank?(value) - - value = html_entities.decode(value) - value = Sanitize.fragment(value) if sanitize - value.strip! - value = Onebox::Helpers.truncate(value, length) unless length.nil? - - value - end - private - def integer_suffixes - ['width', 'height'] - end - - def url_suffixes - ['url', 'image', 'video'] - end - - def html_entities - @html_entities ||= HTMLEntities.new - end - def extract(doc) return {} if Onebox::Helpers::blank?(doc) @@ -88,6 +43,5 @@ module Onebox data end - end end diff --git a/lib/onebox/templates/allowlistedgeneric.mustache b/lib/onebox/templates/allowlistedgeneric.mustache index abc700be933..b988e33c168 100644 --- a/lib/onebox/templates/allowlistedgeneric.mustache +++ b/lib/onebox/templates/allowlistedgeneric.mustache @@ -4,6 +4,7 @@ {{#description}}

{{description}}

+ {{> json_ld_partials/movie}} {{/description}} {{#data_1}} diff --git a/lib/onebox/templates/json_ld_partials/movie.mustache b/lib/onebox/templates/json_ld_partials/movie.mustache new file mode 100644 index 00000000000..740670a458f --- /dev/null +++ b/lib/onebox/templates/json_ld_partials/movie.mustache @@ -0,0 +1,6 @@ +{{#rating}} +

Average Rating: {{rating}}

+{{/rating}} +{{#duration}} +

Duration: {{duration}}

+{{/duration}} diff --git a/spec/lib/onebox/engine/allowlisted_generic_onebox_spec.rb b/spec/lib/onebox/engine/allowlisted_generic_onebox_spec.rb index b46b2450d3a..e4dd0e905d1 100644 --- a/spec/lib/onebox/engine/allowlisted_generic_onebox_spec.rb +++ b/spec/lib/onebox/engine/allowlisted_generic_onebox_spec.rb @@ -217,6 +217,16 @@ describe Onebox::Engine::AllowlistedGenericOnebox do expect(onebox.to_html).to include("Rudy (1993) - IMDb") expect(onebox.to_html).to include("Rudy: Directed by David Anspaugh. With Sean Astin, Jon Favreau, Ned Beatty, Greta Lind. Rudy has always been told that he was too small to play college football.") end + + it 'shows rating' do + onebox = described_class.new("https://www.imdb.com/title/tt0108002/") + expect(onebox.to_html).to include("7.5") + end + + it 'shows duration' do + onebox = described_class.new("https://www.imdb.com/title/tt0108002/") + expect(onebox.to_html).to include("01:54") + end end end end diff --git a/spec/lib/onebox/engine/standard_embed_spec.rb b/spec/lib/onebox/engine/standard_embed_spec.rb new file mode 100644 index 00000000000..b9071e6c18d --- /dev/null +++ b/spec/lib/onebox/engine/standard_embed_spec.rb @@ -0,0 +1,91 @@ +# frozen_string_literal: true + +describe Onebox::Engine::StandardEmbed do + let(:host_class) do + Class.new do + include Onebox::Engine::StandardEmbed + + def options + {} + end + + def url + '' + end + end + end + let(:instance) { host_class.new } + + describe '#raw' do + it 'does not set title_attr from opengraph data' do + Onebox::Helpers.stubs(fetch_html_doc: nil) + Onebox::OpenGraph.any_instance.stubs(:data).returns({ description: "description", title_attr: "should not be returned" }) + Onebox::Oembed.any_instance.stubs(:data).returns({}) + + expect(instance.raw).to eq({ description: "description" }) + end + + it 'sets twitter data' do + html_doc = mocked_html_doc(twitter_data: { "name" => "twitter:url", "content" => "cool.url" }) + Onebox::Helpers.stubs(fetch_html_doc: html_doc) + + expect(instance.raw).to eq({ url: "cool.url" }) + end + + it 'does not override data with twitter data' do + html_doc = mocked_html_doc(twitter_data: { "name" => "twitter:title", "content" => "i do not want to override" }) + Onebox::OpenGraph.any_instance.stubs(:data).returns({ description: "description", title: "do not override me" }) + Onebox::Helpers.stubs(fetch_html_doc: html_doc) + + expect(instance.raw).to eq({ description: "description", title: "do not override me" }) + end + + it 'does not override data with oembed data' do + Onebox::Oembed.any_instance.stubs(:data).returns({ title: "i do not want to override" }) + html_doc = mocked_html_doc(twitter_data: { "name" => "twitter:title", "content" => "do not override me" }) + Onebox::Helpers.stubs(fetch_html_doc: html_doc) + + expect(instance.raw).to eq({ title: "do not override me" }) + end + + it 'sets oembed data' do + Onebox::Helpers.stubs(fetch_html_doc: nil) + Onebox::Oembed.any_instance.stubs(:data).returns({ description: "description" }) + + expect(instance.raw).to eq({ description: "description" }) + end + + it 'does not override data with json_ld data' do + Onebox::Helpers.stubs(fetch_html_doc: nil) + Onebox::JsonLd.any_instance.stubs(:data).returns({ title: "i do not want to override" }) + Onebox::Oembed.any_instance.stubs(:data).returns({ title: "do not override me" }) + + expect(instance.raw).to eq({ title: "do not override me" }) + end + end + + private + + def mocked_html_doc(twitter_data: nil) + html_doc = mock + html_doc.stubs(at_css: nil, at: nil) + stub_twitter(html_doc, twitter_data) + stub_favicon(html_doc) + stub_json_ld + html_doc + end + + def stub_twitter(html_doc, twitter_data = []) + html_doc.expects(:css).with('meta').at_least_once.returns([twitter_data]) + end + + def stub_favicon(html_doc) + html_doc.stubs(:css).with('link[rel="shortcut icon"], link[rel="icon shortcut"], link[rel="shortcut"], link[rel="icon"]').returns([]) + end + + def stub_json_ld + normalizer = mock + normalizer.stubs(:data).returns([]) + Onebox::JsonLd.stubs(new: normalizer) + end +end diff --git a/spec/lib/onebox/json_ld_spec.rb b/spec/lib/onebox/json_ld_spec.rb new file mode 100644 index 00000000000..027f1097ead --- /dev/null +++ b/spec/lib/onebox/json_ld_spec.rb @@ -0,0 +1,54 @@ +# frozen_string_literal: true + +require 'onebox/json_ld' + +describe Onebox::JsonLd do + it 'logs warning and returns an empty hash if received json is invalid' do + invalid_json = "{\"@type\":invalid-json}" + doc = Nokogiri::HTML("") + Discourse.expects(:warn_exception).with( + instance_of(JSON::ParserError), { message: "Error parsing JSON-LD json: #{invalid_json}" } + ) + + json_ld = described_class.new(doc) + + expect(json_ld.data).to eq({}) + end + + it 'returns an empty hash if there is no json_ld script tag' do + doc = Nokogiri::HTML("") + json_ld = described_class.new(doc) + expect(json_ld.data).to eq({}) + end + + it 'returns an empty hash if there is no json_ld data' do + doc = Nokogiri::HTML("") + json_ld = described_class.new(doc) + expect(json_ld.data).to eq({}) + end + + it 'returns an empty hash if the type of JSONLD data is not Movie' do + doc = Nokogiri::HTML("") + json_ld = described_class.new(doc) + expect(json_ld.data).to eq({}) + end + + it 'correctly normalizes the properties' do + doc = Nokogiri::HTML(onebox_response('imdb')) + json_ld = described_class.new(doc) + expect(json_ld.data).to eq(expected_movie_hash) + end + + private + + def expected_movie_hash + { + name: 'Rudy', + image: 'https://m.media-amazon.com/images/M/MV5BZGUzMDU1YmQtMzBkOS00MTNmLTg5ZDQtZjY5Njk4Njk2MmRlXkEyXkFqcGdeQXVyNjc1NTYyMjg@._V1_.jpg', + description: 'Rudy has always been told that he was too small to play college football. But he is determined to overcome the odds and fulfill his dream of playing for Notre Dame.', + rating: 7.5, + genres: ['Biography', 'Drama', 'Sport'], + duration: '01:54' + } + end +end diff --git a/spec/lib/onebox/movie_spec.rb b/spec/lib/onebox/movie_spec.rb new file mode 100644 index 00000000000..0be75aab533 --- /dev/null +++ b/spec/lib/onebox/movie_spec.rb @@ -0,0 +1,52 @@ +# frozen_string_literal: true + +require 'onebox/movie' + +describe Onebox::Movie do + it 'returns a nil rating if there is no aggregateRating item in json_ld data' do + json_ld_data = json_ld_data_from_doc( + "" + ) + json_ld = described_class.new(json_ld_data) + expect(json_ld.rating).to eq(nil) + end + + it 'returns a nil rating if there is no ratingValue item in json_ld data' do + json_ld_data = json_ld_data_from_doc( + "" + ) + json_ld = described_class.new(json_ld_data) + expect(json_ld.rating).to eq(nil) + end + + it 'returns a nil if there is no duration in json_ld data' do + json_ld_data = json_ld_data_from_doc( + "" + ) + json_ld = described_class.new(json_ld_data) + expect(json_ld.duration).to eq(nil) + end + + it 'to_h returns hash version of the object' do + json_ld_data = json_ld_data_from_doc(onebox_response('imdb')) + movie = described_class.new(json_ld_data) + expect(movie.to_h).to eq(expected_movie_hash) + end + + private + + def json_ld_data_from_doc(html) + JSON[Nokogiri::HTML(html).search('script[type="application/ld+json"]').text] + end + + def expected_movie_hash + { + name: 'Rudy', + image: 'https://m.media-amazon.com/images/M/MV5BZGUzMDU1YmQtMzBkOS00MTNmLTg5ZDQtZjY5Njk4Njk2MmRlXkEyXkFqcGdeQXVyNjc1NTYyMjg@._V1_.jpg', + description: 'Rudy has always been told that he was too small to play college football. But he is determined to overcome the odds and fulfill his dream of playing for Notre Dame.', + rating: 7.5, + genres: ['Biography', 'Drama', 'Sport'], + duration: '01:54' + } + end +end diff --git a/spec/lib/onebox/oembed_spec.rb b/spec/lib/onebox/oembed_spec.rb index de98ad358a6..03559f46737 100644 --- a/spec/lib/onebox/oembed_spec.rb +++ b/spec/lib/onebox/oembed_spec.rb @@ -3,7 +3,7 @@ require "onebox/oembed" describe Onebox::Oembed do - it "excludes html tags" do + it "excludes text tags" do json = '{"text": ""}' oembed = described_class.new(json) expect(oembed.text).to be_nil