lib/onebox/helpers.rb in onebox-2.2.8 vs lib/onebox/helpers.rb in onebox-2.2.9

- old
+ new

@@ -22,12 +22,12 @@ def self.clean(html) html.gsub(/<[^>]+>/, ' ').gsub(/\n/, '') end - def self.fetch_html_doc(url, headers = nil) - response = (fetch_response(url, nil, nil, headers) rescue nil) + def self.fetch_html_doc(url, headers = nil, body_cacher = nil) + response = (fetch_response(url, headers: headers, body_cacher: body_cacher) rescue nil) doc = Nokogiri::HTML(response) uri = Addressable::URI.parse(url) ignore_canonical_tag = doc.at('meta[property="og:ignore_canonical"]') should_ignore_canonical = IGNORE_CANONICAL_DOMAINS.map { |hostname| uri.hostname.match?(hostname) }.any? @@ -35,28 +35,35 @@ unless (ignore_canonical_tag && ignore_canonical_tag['content'].to_s == 'true') || should_ignore_canonical # prefer canonical link canonical_link = doc.at('//link[@rel="canonical"]/@href') canonical_uri = Addressable::URI.parse(canonical_link) if canonical_link && "#{canonical_uri.host}#{canonical_uri.path}" != "#{uri.host}#{uri.path}" - response = (fetch_response(canonical_uri.to_s, nil, nil, headers) rescue nil) + response = (fetch_response(canonical_uri.to_s, headers: headers, body_cacher: body_cacher) rescue nil) doc = Nokogiri::HTML(response) if response end end doc end - def self.fetch_response(location, limit = nil, domain = nil, headers = nil) + def self.fetch_response(location, redirect_limit: 5, domain: nil, headers: nil, body_cacher: nil) + redirect_limit = Onebox.options.redirect_limit if redirect_limit > Onebox.options.redirect_limit - limit ||= 5 - limit = Onebox.options.redirect_limit if limit > Onebox.options.redirect_limit + raise Net::HTTPError.new('HTTP redirect too deep', location) if redirect_limit == 0 - raise Net::HTTPError.new('HTTP redirect too deep', location) if limit == 0 - uri = Addressable::URI.parse(location) uri = Addressable::URI.join(domain, uri) if !uri.host + use_body_cacher = body_cacher && body_cacher.respond_to?('fetch_cached_response_body') + if use_body_cacher + response_body = body_cacher.fetch_cached_response_body(uri.to_s) + + if response_body.present? + return response_body + end + end + result = StringIO.new Net::HTTP.start(uri.host, uri.port, use_ssl: uri.normalized_scheme == 'https') do |http| http.open_timeout = Onebox.options.connect_timeout http.read_timeout = Onebox.options.timeout http.verify_mode = OpenSSL::SSL::VERIFY_NONE # Work around path building bugs @@ -84,19 +91,23 @@ code = response.code.to_i unless code === 200 response.error! unless [301, 302].include?(code) return fetch_response( response['location'], - limit - 1, - "#{uri.scheme}://#{uri.host}", - redir_header + redirect_limit: redirect_limit - 1, + domain: "#{uri.scheme}://#{uri.host}", + headers: redir_header ) end response.read_body do |chunk| result.write(chunk) raise DownloadTooLarge.new if result.size > size_bytes raise Timeout::Error.new if (Time.now - start_time) > Onebox.options.timeout + end + + if use_body_cacher && body_cacher.cache_response_body?(uri) + body_cacher.cache_response_body(uri.to_s, result.string) end return result.string end end