lib/onebox/helpers.rb in onebox-2.2.8 vs lib/onebox/helpers.rb in onebox-2.2.9
- old
+ new
@@ -22,12 +22,12 @@
# Reduces an HTML fragment to plain text on a single line.
#
# Every `<...>` span (a `<`, one or more non-`>` characters, then `>`) is
# replaced by a single space, after which all newline characters are removed.
# The input string is not mutated; a new string is returned.
def self.clean(html)
  without_tags = html.gsub(/<[^>]+>/, ' ')
  without_tags.delete("\n")
end
- def self.fetch_html_doc(url, headers = nil)
- response = (fetch_response(url, nil, nil, headers) rescue nil)
+ def self.fetch_html_doc(url, headers = nil, body_cacher = nil)
+ response = (fetch_response(url, headers: headers, body_cacher: body_cacher) rescue nil)
doc = Nokogiri::HTML(response)
uri = Addressable::URI.parse(url)
ignore_canonical_tag = doc.at('meta[property="og:ignore_canonical"]')
should_ignore_canonical = IGNORE_CANONICAL_DOMAINS.map { |hostname| uri.hostname.match?(hostname) }.any?
@@ -35,28 +35,35 @@
unless (ignore_canonical_tag && ignore_canonical_tag['content'].to_s == 'true') || should_ignore_canonical
# prefer canonical link
canonical_link = doc.at('//link[@rel="canonical"]/@href')
canonical_uri = Addressable::URI.parse(canonical_link)
if canonical_link && "#{canonical_uri.host}#{canonical_uri.path}" != "#{uri.host}#{uri.path}"
- response = (fetch_response(canonical_uri.to_s, nil, nil, headers) rescue nil)
+ response = (fetch_response(canonical_uri.to_s, headers: headers, body_cacher: body_cacher) rescue nil)
doc = Nokogiri::HTML(response) if response
end
end
doc
end
- def self.fetch_response(location, limit = nil, domain = nil, headers = nil)
+ def self.fetch_response(location, redirect_limit: 5, domain: nil, headers: nil, body_cacher: nil)
+ redirect_limit = Onebox.options.redirect_limit if redirect_limit > Onebox.options.redirect_limit
- limit ||= 5
- limit = Onebox.options.redirect_limit if limit > Onebox.options.redirect_limit
+ raise Net::HTTPError.new('HTTP redirect too deep', location) if redirect_limit == 0
- raise Net::HTTPError.new('HTTP redirect too deep', location) if limit == 0
-
uri = Addressable::URI.parse(location)
uri = Addressable::URI.join(domain, uri) if !uri.host
+ use_body_cacher = body_cacher && body_cacher.respond_to?('fetch_cached_response_body')
+ if use_body_cacher
+ response_body = body_cacher.fetch_cached_response_body(uri.to_s)
+
+ if response_body.present?
+ return response_body
+ end
+ end
+
result = StringIO.new
Net::HTTP.start(uri.host, uri.port, use_ssl: uri.normalized_scheme == 'https') do |http|
http.open_timeout = Onebox.options.connect_timeout
http.read_timeout = Onebox.options.timeout
http.verify_mode = OpenSSL::SSL::VERIFY_NONE # Work around path building bugs
@@ -84,19 +91,23 @@
code = response.code.to_i
unless code === 200
response.error! unless [301, 302].include?(code)
return fetch_response(
response['location'],
- limit - 1,
- "#{uri.scheme}://#{uri.host}",
- redir_header
+ redirect_limit: redirect_limit - 1,
+ domain: "#{uri.scheme}://#{uri.host}",
+ headers: redir_header
)
end
response.read_body do |chunk|
result.write(chunk)
raise DownloadTooLarge.new if result.size > size_bytes
raise Timeout::Error.new if (Time.now - start_time) > Onebox.options.timeout
+ end
+
+ if use_body_cacher && body_cacher.cache_response_body?(uri)
+ body_cacher.cache_response_body(uri.to_s, result.string)
end
return result.string
end
end