lib/url_canonicalize/request.rb in url_canonicalize-0.0.5 vs lib/url_canonicalize/request.rb in url_canonicalize-0.0.6

- old
+ new

# NOTE(review): this span was a flattened 0.0.5 -> 0.0.6 diff. What follows is
# the 0.0.6 ("+") side of its hunks, re-indented as Ruby, with debug output
# removed. Code between or outside the hunks is not visible from here.
#
# (Hunk context only: the preceding method ends by returning
# `enhanced_response`; its body is outside the visible hunk.)

# Handles a redirect response.
#
# For Net::HTTPFound / HTTPMovedTemporarily / HTTPTemporaryRedirect the HTTP
# method is switched to :get and the result of `handle_success` is returned.
# For any other response a Redirect is returned whose target is the Location
# header passed through `relative_to_absolute`.
def handle_redirection
  case response
  when Net::HTTPFound, Net::HTTPMovedTemporarily, Net::HTTPTemporaryRedirect
    self.http_method = :get
    handle_success
  else
    location = relative_to_absolute(response['location'])
    URLCanonicalize::Response::Redirect.new(location)
  end
end

# Builds a Failure response. Both arguments default to the class and message
# of the current response.
def handle_failure(klass = response.class, message = response.message)
  URLCanonicalize::Response::Failure.new(klass, message)
end

# Wraps the current response.
#
# Returns a CanonicalFound (around a Success built for the canonical URL) when
# `canonical_url` is present, otherwise a plain Success for `url`.
def enhanced_response
  if canonical_url
    response_plus = URLCanonicalize::Response::Success.new(canonical_url, response, html)
    URLCanonicalize::Response::CanonicalFound.new(canonical_url, response_plus)
  else
    URLCanonicalize::Response::Success.new(url, response, html)
  end
end

# Parsed response body, memoized.
def html
  @html ||= Nokogiri::HTML(response.body)
end

# Canonical URL declared by the page, passed through `relative_to_absolute`.
# Returns nil when the page declares none.
def canonical_url
  @canonical_url ||= relative_to_absolute(canonical_url_raw)
end

# The canonical link's href exactly as written in the document, or nil when no
# canonical <link> element was found.
#
# Memoized in its own ivar: sharing @canonical_url with `canonical_url` meant
# that calling this reader first left the unresolved href cached as the
# "absolute" canonical URL.
def canonical_url_raw
  return unless canonical_url_element.is_a?(Nokogiri::XML::Element)

  @canonical_url_raw ||= canonical_url_element['href']
end

# First <link rel="canonical"> under <head>, memoized.
def canonical_url_element
  @canonical_url_element ||= html.xpath('//head/link[@rel="canonical"]').first
end

# URI of the current HTTP object, memoized.
def uri
  @uri ||= http.uri
end

# (`def url` opens here in the file, but its body is outside the visible hunk.)

# Some sites treat HEAD requests as suspicious activity and block the
# requester after a few attempts.
# For these sites we'll use GET requests
# only
def check_http_method
  # The dot is escaped so that only a literal ".com" matches.
  @http_method = :get if host =~ /(linkedin|crunchbase)\.com/
end

# Resolves a possibly-relative URL against the URL of the current request.
#
# partial_url - String href / Location value, or nil.
#
# Returns nil for nil input, the input unchanged when it already carries both
# a scheme and a host, and otherwise the reference resolved against `uri`
# (falling back to `url` when `uri` is nil) as a String.
# Raises URI::InvalidURIError when partial_url cannot be parsed.
def relative_to_absolute(partial_url)
  return unless partial_url

  partial_uri = ::URI.parse(partial_url)
  return partial_url if partial_uri.scheme && partial_uri.host # already absolute

  base_uri = uri || ::URI.parse(url)
  # URI.join does the reference resolution: it copes with query strings,
  # fragments, path-relative and scheme-relative references, drops the base's
  # own query/fragment, and leaves base_uri untouched.
  ::URI.join(base_uri, partial_url).to_s
end

# NETWORK_EXCEPTIONS = [EOFError, Errno::ECONNREFUSED, ... -- the list is cut
# off at the end of the visible hunk, so it is kept here as a note, not code.