module URLCanonicalize # Make an HTTP request class Request def fetch handle_response end private attr_reader :http, :http_method def initialize(http, http_method = :head) @http = http @http_method = http_method end def response @response ||= http.request request # Some URLs can throw an exception here end def request @request ||= request_for_method end def handle_response case response when Net::HTTPSuccess handle_success when Net::HTTPRedirection handle_redirection else handle_failure end rescue *NETWORK_EXCEPTIONS => e handle_failure(e.class, e.message) end def handle_success @canonical_url = $LAST_MATCH_INFO['url'] if response['link'] =~ /<(?.+)>\s*;\s*rel="canonical"/i if http_method == :head self.http_method = :get fetch else enhanced_response end end def handle_redirection case response when Net::HTTPFound, Net::HTTPMovedTemporarily, Net::HTTPTemporaryRedirect self.http_method = :get handle_success else URLCanonicalize::Response::Redirect.new(response['location']) end end def handle_failure(klass = response.class, message = response.message) URLCanonicalize::Response::Failure.new(klass, message) end def enhanced_response if canonical_url response_plus = URLCanonicalize::Response::Success.new(canonical_url, response, html) URLCanonicalize::Response::CanonicalFound.new(canonical_url, response_plus) else URLCanonicalize::Response::Success.new(url, response, html) end end def html @html ||= Nokogiri::HTML response.body end def canonical_url_element @canonical_url_element ||= html.xpath('//head/link[@rel="canonical"]').first end def canonical_url @canonical_url ||= canonical_url_element['href'] if canonical_url_element.is_a?(Nokogiri::XML::Element) end def uri @uri ||= http.uri end def url @url ||= uri.to_s end def host @host ||= uri.host end def request_for_method r = base_request headers.each { |header_key, header_value| r[header_key] = header_value } r end def base_request check_http_method case http_method when :head Net::HTTP::Head.new uri when :get Net::HTTP::Get.new uri else raise URLCanonicalize::Exception::Request, "Unknown method: #{method}" end end def headers @headers ||= { 'Accept-Language' => 'en-US,en;q=0.8', 'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; WOW64) '\ 'AppleWebKit/537.36 (KHTML, like Gecko) '\ 'Chrome/51.0.2704.103 Safari/537.36' } end def http_method=(value) @http_method = value @request = nil @response = nil @html = nil end # Some sites treat HEAD requests as suspicious activity and block the # requester after a few attempts. For these sites we'll use GET requests # only def check_http_method @http_method = :get if host =~ /(linkedin|crunchbase).com/ end NETWORK_EXCEPTIONS = [ EOFError, Errno::ECONNREFUSED, Errno::ECONNRESET, Errno::EHOSTUNREACH, Errno::EINVAL, Errno::ENETUNREACH, Errno::ETIMEDOUT, Net::OpenTimeout, Net::ReadTimeout, OpenSSL::SSL::SSLError, SocketError, Timeout::Error ].freeze end end