2021-05-26 05:41:35 -04:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
|
|
|
require "json"
|
|
|
|
require "onebox/open_graph"
|
|
|
|
|
|
|
|
module Onebox
|
|
|
|
module Engine
|
|
|
|
class AmazonOnebox
|
|
|
|
include Engine
|
|
|
|
include LayoutSupport
|
|
|
|
include HTML
|
|
|
|
|
|
|
|
always_https
|
|
|
|
matches_regexp(
|
|
|
|
%r{^https?://(?:www\.)?(?:smile\.)?(amazon|amzn)\.(?<tld>com|ca|de|it|es|fr|co\.jp|co\.uk|cn|in|com\.br|com\.mx|nl|pl|sa|sg|se|com\.tr|ae)/},
|
|
|
|
)
|
|
|
|
|
|
|
|
# Returns the URL to use for this product.
#
# Preference order:
# 1. the canonical link found in the HTML document, when a document is already
#    loaded or a cached body is available;
# 2. a normalized "https://www.amazon.<tld>/dp/<id>" URL built from the
#    product id captured out of the original URL;
# 3. the original URL, unchanged.
def url
  @raw ||= nil

  # If possible, fetch the cached HTML body immediately so we can try to grab
  # the canonical URL from that document, rather than guess at the best URL
  # structure to use.
  @raw = Onebox::Helpers.fetch_html_doc(@url, http_params, body_cacher) if !@raw && has_cached_body

  canonical_link = @raw&.at('//link[@rel="canonical"]/@href')
  return canonical_link.to_s if canonical_link

  product_id = match && match[:id]
  return @url unless product_id

  encoded_id =
    Addressable::URI.encode_component(product_id, Addressable::URI::CharacterClasses::PATH)
  "https://www.amazon.#{tld}/dp/#{encoded_id}"
end
|
|
|
|
|
|
|
|
# Returns the "tld" named capture obtained by matching @url against the
# class-level @@matcher, memoized in @tld.
# NOTE(review): @@matcher is not assigned in this file; presumably it is set by
# the `matches_regexp` macro call in the class body -- confirm.
def tld
  @tld ||= @@matcher.match(@url)["tld"]
end
|
|
|
|
|
|
|
|
# Returns the extra HTTP headers to send when fetching the page: a hash with a
# "User-Agent" entry when @options carries a :user_agent, otherwise nil.
def http_params
  user_agent = @options && @options[:user_agent]
  { "User-Agent" => user_agent } if user_agent
end
|
|
|
|
|
2021-06-01 16:23:18 -04:00
|
|
|
# Renders the onebox HTML by delegating to the inherited implementation.
#
# Unless +ignore_errors+ is true, the required fields are validated first and
# an empty string is returned when any validation error was recorded.
def to_html(ignore_errors = false)
  unless ignore_errors
    verified_data # forces a check for missing fields
    return "" unless errors.empty?
  end

  super()
end
|
|
|
|
|
|
|
|
# Renders the onebox without validating the required fields first, by calling
# to_html with errors ignored.
def placeholder_html
  to_html(true)
end
|
|
|
|
|
|
|
|
# Returns the product data, memoized in @verified_data. On first computation,
# records an "is blank" error for each required field (:title, :description)
# that is blank in the data.
def verified_data
  @verified_data ||=
    data.tap do |result|
      %i[title description].each do |tag|
        (errors[tag] ||= []) << "is blank" if result[tag].blank?
      end
    end
end
|
|
|
|
|
2021-05-26 05:41:35 -04:00
|
|
|
private
|
|
|
|
|
|
|
|
# Tells whether a cached response body can be used for this URI: there must be
# a body cacher that responds to cache_response_body?, it must agree to cache
# this URI, and it must report that a cached body exists for it.
def has_cached_body
  cacher = body_cacher
  return false unless cacher&.respond_to?("cache_response_body?")

  target = uri.to_s
  cacher.cache_response_body?(target) && cacher.cached_response_body_exists?(target)
end
|
|
|
|
|
|
|
|
# Matches @url against the product-path pattern ("dp/" or "gp/", optionally
# followed by "product/" or "video/detail/") and returns the MatchData whose
# :id capture is the product id, or nil when the URL has no such path.
# A successful match is memoized in @match.
def match
  @match ||= @url.match(%r{(?:d|g)p/(?:product/|video/detail/)?(?<id>[A-Z0-9]+)(?:/|\?|$)}mi)
end
|
|
|
|
|
|
|
|
# Returns the product image URL found in the page markup, or nil when none of
# the known image elements is present.
#
# Elements are tried in order: "#main-image", "#landingImage", then
# "#ebooksImgBlkFront". An attribute that is missing or empty is skipped so the
# next candidate is tried instead of returning an empty URL or raising.
def image
  main_image = raw.css("#main-image").first
  if main_image
    hires = main_image["data-a-hires"].to_s
    return hires unless hires.empty?

    dynamic = first_dynamic_image_url(main_image)
    return dynamic if dynamic
  end

  landing_image = raw.css("#landingImage").first
  if landing_image
    old_hires = landing_image["data-old-hires"].to_s
    return old_hires unless old_hires.empty?

    return landing_image["src"].to_s
  end

  ebook_image = raw.css("#ebooksImgBlkFront").first
  first_dynamic_image_url(ebook_image) if ebook_image
end

# Returns the first key of the JSON object stored in the node's
# "data-a-dynamic-image" attribute, or nil when the attribute is missing,
# empty, not a JSON object, or not valid JSON.
def first_dynamic_image_url(node)
  payload = node["data-a-dynamic-image"].to_s
  return if payload.empty?

  parsed = ::JSON.parse(payload)
  parsed.keys.first if parsed.is_a?(Hash)
rescue ::JSON::ParserError
  # Image extraction is best-effort: malformed markup must not abort the onebox.
  nil
end
|
|
|
|
|
|
|
|
# Returns the item price as text, or an empty string when no known price
# element is present. Amazon markup is inconsistent, so several layouts are
# tried in order:
# 1. a split price inside "#priceblock_ourprice" (".restOfPrice" /
#    ".buyingPrice" / ".restOfPrice"), used only when all three parts exist;
# 2. the first span inside "#priceblock_dealprice";
# 3. the whole text of "#priceblock_ourprice";
# 4. the first "#corePrice_feature_div .a-price .a-offscreen" element;
# 5. the ".mediaMatrixListItem.a-active .a-color-price" text.
def price
  rest_of_price = raw.css("#priceblock_ourprice .restOfPrice")
  # Only look for the middle part when both outer parts are there, so a partial
  # split price falls through to the other layouts instead of raising.
  buying_price = raw.css("#priceblock_ourprice .buyingPrice").first if rest_of_price[1]
  if buying_price
    return "#{rest_of_price[0].inner_text}#{buying_price.inner_text}.#{rest_of_price[1].inner_text}"
  end

  deal_price = raw.css("#priceblock_dealprice span").first
  return deal_price.inner_text if deal_price

  our_price = raw.css("#priceblock_ourprice").inner_text
  return our_price unless our_price.empty?

  core_price = raw.css("#corePrice_feature_div .a-price .a-offscreen").first&.inner_text
  return core_price unless core_price.blank?

  raw.css(".mediaMatrixListItem.a-active .a-color-price").inner_text
end
|
|
|
|
|
|
|
|
# Returns the stripped text of every node matching +authors_xpath+, joined with
# ", ". Returns an empty string when nothing matches.
def multiple_authors(authors_xpath)
  raw.xpath(authors_xpath).map { |author| author.inner_text.strip }.join(", ")
end
|
|
|
|
|
|
|
|
# Builds the hash of fields for the product page.
#
# The page is treated as a printed book ("#dp.book_mobile"), an ebook
# ("#dp.ebooks_mobile") or a generic product, and the matching helper extracts
# the fields. The price is resolved once and shared with the helpers.
def data
  og = ::Onebox::OpenGraph.new(raw)
  current_price = price

  result =
    if raw.at_css("#dp.book_mobile") # printed books
      printed_book_data(og, current_price)
    elsif raw.at_css("#dp.ebooks_mobile") # ebooks
      ebook_data(og, current_price)
    else
      generic_product_data(og, current_price)
    end

  # Hide prices that are zero or whose text starts with "$0".
  # NOTE(review): the "$0" prefix also matches sub-dollar prices such as
  # "$0.99" -- confirm that hiding those is intended.
  listed_price = result[:price]
  result[:price] = nil if listed_price == 0 || listed_price.to_s.start_with?("$0")

  result
end

# Returns +text+ followed by ", " when +text+ is set and at least one of
# +followers+ is non-empty; otherwise returns +text+ as a string. A nil
# follower counts as empty, so no dangling separator is produced.
def append_separator(text, *followers)
  return text.to_s unless text && followers.any? { |value| !value.to_s.empty? }

  "#{text}, "
end

# Extracts the fields of a printed book page.
def printed_book_data(og, current_price)
  authors =
    if raw.at_css("#byline_secondary_view_div")
      multiple_authors("//div[@id='byline_secondary_view_div']//span[@class='a-text-bold']")
    else
      raw.at("#byline")&.inner_text
    end

  rating =
    raw.at("#averageCustomerReviews_feature_div .a-icon")&.inner_text ||
      raw.at("#cmrsArcLink .a-icon")&.inner_text

  table_xpath =
    "//div[@id='productDetails_secondary_view_div']//table[@id='productDetails_techSpec_section_1']"
  isbn = raw.xpath("#{table_xpath}//tr[8]//td").inner_text.strip

  # if ISBN is misplaced or absent it's hard to find out which data is
  # available and where to find it so just set it all to nil
  if /^\d(\-?\d){12}$/.match(isbn)
    publisher = raw.xpath("#{table_xpath}//tr[1]//td").inner_text.strip
    published = raw.xpath("#{table_xpath}//tr[2]//td").inner_text.strip
  else
    isbn = publisher = published = nil
  end

  {
    link: url,
    title: raw.at("h1#title")&.inner_text,
    by_info: authors,
    image: og.image || image,
    description: raw.at("#productDescription")&.inner_text,
    rating: append_separator(rating, isbn, current_price),
    price: current_price,
    isbn_asin_text: "ISBN",
    isbn_asin: isbn,
    publisher: publisher,
    published: append_separator(published, current_price),
  }
end

# Extracts the fields of an ebook page.
def ebook_data(og, current_price)
  authors =
    if raw.at_css("#a-popover-mobile-udp-contributor-popover-id")
      multiple_authors(
        "//div[@id='a-popover-mobile-udp-contributor-popover-id']//span[contains(@class,'a-text-bold')]",
      )
    else
      raw.at("#byline")&.inner_text&.strip || raw.at("#bylineInfo")&.inner_text&.strip
    end

  rating =
    raw.at("#averageCustomerReviews_feature_div .a-icon")&.inner_text ||
      raw.at("#cmrsArcLink .a-icon")&.inner_text ||
      raw.at("#acrCustomerReviewLink .a-icon")&.inner_text

  table_xpath = "//div[@id='detailBullets_secondary_view_div']//ul"
  asin = raw.xpath("#{table_xpath}//li[4]/span/span[2]").inner_text

  # if ASIN is misplaced or absent it's hard to find out which data is
  # available and where to find it so just set it all to nil
  if /^[0-9A-Z]{10}$/.match(asin)
    publisher = raw.xpath("#{table_xpath}//li[2]/span/span[2]").inner_text
    published = raw.xpath("#{table_xpath}//li[1]/span/span[2]").inner_text
  else
    asin = publisher = published = nil
  end

  {
    link: url,
    title: raw.at("#ebooksTitle")&.inner_text,
    by_info: authors,
    image: og.image || image,
    description: raw.at("#productDescription")&.inner_text,
    rating: append_separator(rating, asin, current_price),
    price: current_price,
    isbn_asin_text: "ASIN",
    isbn_asin: asin,
    publisher: publisher,
    published: append_separator(published, current_price),
  }
end

# Extracts the fields of any other product page. The description comes from
# OpenGraph, then "#productDescription", then the description meta tag, and is
# truncated to 250 characters.
def generic_product_data(og, current_price)
  byline = raw.at("#by-line")

  result = {
    link: url,
    title: og.title || CGI.unescapeHTML(raw.css("title").inner_text),
    image: og.image || image,
    price: current_price,
    by_info: byline && Onebox::Helpers.clean(byline.inner_html),
  }

  description = og.description || raw.at("#productDescription")&.inner_text&.strip
  description = raw.css("meta[name=description]").first&.[]("content") if description.blank?
  if description
    result[:description] = CGI.unescapeHTML(Onebox::Helpers.truncate(description, 250))
  end

  result
end
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|