lib/onebox/engine/amazon_onebox.rb in onebox-2.2.9 vs lib/onebox/engine/amazon_onebox.rb in onebox-2.2.10
- old
+ new
@@ -12,21 +12,24 @@
always_https
matches_regexp(/^https?:\/\/(?:www\.)?(?:smile\.)?(amazon|amzn)\.(?<tld>com|ca|de|it|es|fr|co\.jp|co\.uk|cn|in|com\.br|com\.mx|nl|pl|sa|sg|se|com\.tr|ae)\//)
def url
- # Have we cached the HTML body of the requested URL?
- # If so, try to grab the canonical URL from that document,
+ # If possible, fetch the cached HTML body immediately so we can
+ # try to grab the canonical URL from that document,
# rather than guess at the best URL structure to use
- if @body_cacher && @body_cacher.respond_to?('cache_response_body?')
- if @body_cacher.cached_response_body_exists?(uri.to_s)
- @raw ||= Onebox::Helpers.fetch_html_doc(@url, http_params, @body_cacher)
- canonical_link = @raw.at('//link[@rel="canonical"]/@href')
- return canonical_link.to_s if canonical_link
+ if body_cacher&.respond_to?('cache_response_body?')
+ if body_cacher.cache_response_body?(uri.to_s) && body_cacher.cached_response_body_exists?(uri.to_s)
+ @raw ||= Onebox::Helpers.fetch_html_doc(@url, http_params, body_cacher)
end
end
+ if @raw
+ canonical_link = @raw.at('//link[@rel="canonical"]/@href')
+ return canonical_link.to_s if canonical_link
+ end
+
if match && match[:id]
return "https://www.amazon.#{tld}/dp/#{Onebox::Helpers.uri_encode(match[:id])}"
end
@url
@@ -43,11 +46,11 @@
end
private
# Matches @url against the product-id pattern and caches the result in
# @match (because of `||=`, a nil result is recomputed on the next call).
# The named capture `id` is the text that follows "dp/" or "gp/",
# optionally preceded by "product/" or "video/detail/".
# The "-" line is the earlier pattern: `id` was any run of non-slash
# characters, ending at "/" or end of line, so it could include a "?"
# and whatever followed it.
# The "+" line is the newer pattern: `id` is limited to letters and digits
# (the /i flag makes A-Z case-insensitive) and may also end at a "?".
def match
- @match ||= @url.match(/(?:d|g)p\/(?:product\/|video\/detail\/)?(?<id>[^\/]+)(?:\/|$)/mi)
+ @match ||= @url.match(/(?:d|g)p\/(?:product\/|video\/detail\/)?(?<id>[A-Z0-9]+)(?:\/|\?|$)/mi)
end
def image
if (main_image = raw.css("#main-image")) && main_image.any?
attributes = main_image.first.attributes
@@ -58,10 +61,14 @@
return ::JSON.parse(attributes["data-a-dynamic-image"].value).keys.first
end
end
if (landing_image = raw.css("#landingImage")) && landing_image.any?
+ attributes = landing_image.first.attributes
+
+ return attributes["data-old-hires"].to_s if attributes["data-old-hires"]
+
landing_image.first["src"].to_s
end
if (ebook_image = raw.css("#ebooksImgBlkFront")) && ebook_image.any?
::JSON.parse(ebook_image.first.attributes["data-a-dynamic-image"].value).keys.first
@@ -108,11 +115,11 @@
else
isbn = publisher = published = book_length = nil
end
result = {
- link: link,
+ link: url,
title: title,
by_info: authors,
image: og.image || image,
description: raw.at("#productDescription")&.inner_text,
rating: "#{rating}#{', ' if rating && (!isbn&.empty? || !price&.empty?)}",
@@ -139,11 +146,11 @@
else
asin = publisher = published = nil
end
result = {
- link: link,
+ link: url,
title: title,
by_info: authors,
image: og.image || image,
description: raw.at("#productDescription")&.inner_text,
rating: "#{rating}#{', ' if rating && (!asin&.empty? || !price&.empty?)}",
@@ -155,20 +162,23 @@
}
else
title = og.title || CGI.unescapeHTML(raw.css("title").inner_text)
result = {
- link: link,
+ link: url,
title: title,
image: og.image || image,
price: price
}
result[:by_info] = raw.at("#by-line")
result[:by_info] = Onebox::Helpers.clean(result[:by_info].inner_html) if result[:by_info]
summary = raw.at("#productDescription")
- result[:description] = og.description || (summary && summary.inner_text) || CGI.unescapeHTML(Onebox::Helpers.truncate(raw.css("meta[name=description]").first["content"], 250))
+
+ description = og.description || summary&.inner_text
+ description ||= raw.css("meta[name=description]").first&.[]("content")
+ result[:description] = CGI.unescapeHTML(Onebox::Helpers.truncate(description, 250)) if description
end
result[:price] = nil if result[:price].start_with?("$0") || result[:price] == 0
result