require 'uri'
require 'anemone/http'
require 'hpricot'

module Anemone
  #
  # A single crawled page: its URL, the HTTP response it was built from,
  # the in-domain links found in its body, and any redirect-aliases.
  #
  class Page
    # Escapes characters that are not legal in a URI. URI.encode / URI.escape
    # were removed in Ruby 3.0; the RFC 2396 parser object provides the same
    # escaping. Fall back to the URI module itself on Rubies that predate
    # URI::RFC2396_Parser.
    URI_ESCAPER = defined?(URI::RFC2396_Parser) ? URI::RFC2396_Parser.new : URI

    # The URL of the page
    attr_reader :url
    # Array of distinct A tag HREFs from the page
    attr_reader :links
    # Integer response code of the page
    attr_reader :code

    # Array of redirect-aliases for the page
    attr_accessor :aliases
    # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
    attr_accessor :visited
    # Used by PageHash#shortest_paths! to store depth of the page
    attr_accessor :depth

    #
    # Create a new Page from the response of an HTTP request to *url*
    #
    # *url* may be a String or a URI object. If the location reported by
    # Anemone::HTTP.get differs from *url*, it is recorded as an alias.
    #
    # This is deliberately best-effort: if anything raises a StandardError
    # while fetching or building the page, a Page holding only *url*
    # (no response, no code, no links) is returned instead.
    #
    def self.fetch(url)
      url = URI(url) if url.is_a?(String)
      response, code, location = Anemone::HTTP.get(url)
      aka = url.eql?(location) ? nil : location
      Page.new(url, response, code, aka)
    rescue StandardError
      Page.new(url)
    end

    #
    # Create a new page
    #
    # *url* is the page's URL, *response* the HTTP response object (or nil),
    # *code* the response code (or nil), and *aka* an optional redirect-alias.
    #
    # When a response with a body is given, the body is parsed and the
    # distinct in-domain links are stored in absolute URL form.
    #
    def initialize(url, response = nil, code = nil, aka = nil)
      @url = url
      @response = response
      @code = code
      @aliases = []
      @aliases << aka unless aka.nil?

      @links = []
      @links = extract_links(@response.body) if @response && @response.body
    end

    #
    # Return a new page with the same *response* as this page, located at
    # *url*, with a 200 response code and this page's URL as its alias
    #
    def alias_clone(url)
      Page.new(url, @response, 200, @url)
    end

    #
    # Add a redirect-alias *aka* to the list of the page's aliases,
    # unless it is already present
    #
    # Returns *self*
    #
    def add_alias!(aka)
      @aliases << aka unless @aliases.include?(aka)
      self
    end

    #
    # Returns an Array of all links from this page, each followed by the
    # redirect-aliases of the page it points to.
    #
    # *page_hash* is a PageHash object with the results of the current crawl.
    # Links that have no entry in *page_hash* contribute only themselves.
    #
    def links_and_their_aliases(page_hash)
      @links.inject([]) do |results, link|
        results << link
        linked_page = page_hash[link]
        results.concat(linked_page.aliases) if linked_page
        results
      end
    end

    #
    # Returns the response body for the page, or +nil+ if the page
    # has no response
    #
    def body
      @response && @response.body
    end

    #
    # Returns the +Content-Type+ header for the page, or +nil+ if the page
    # has no response
    #
    def content_type
      @response && @response['Content-Type']
    end

    #
    # Returns +true+ if the page is a HTML document (its content type
    # starts with text/html), returns +false+ otherwise.
    #
    def html?
      (content_type =~ %r{text/html}) == 0
    end

    #
    # Returns +true+ if the page is a HTTP redirect (3xx code),
    # returns +false+ otherwise.
    #
    def redirect?
      (300..399).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      @code == 404
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    # The fragment ("#...") is dropped, illegal characters are escaped,
    # and an empty path is normalised to "/". Raises whatever URI() or
    # URI#merge raise when *link* cannot be parsed or resolved.
    #
    def to_absolute(link)
      # Strip everything from the first '#': a fragment may contain
      # characters such as '.' or ':' and a leftover '#' would otherwise be
      # escaped to %23 and become part of the path.
      without_fragment = link.to_s.sub(/#.*\z/m, '')
      relative = URI(URI_ESCAPER.escape(without_fragment))
      absolute = @url.merge(relative)
      absolute.path = '/' if absolute.path.empty?
      absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page (its host
    # equals the host of the page's URL), returns +false+ otherwise
    #
    def in_domain?(uri)
      uri.host == @url.host
    end

    private

    #
    # Returns the distinct in-domain links found in the A tags of *html*,
    # in absolute URL form. A link that cannot be parsed or resolved is
    # skipped so that one bad HREF does not invalidate the whole page.
    #
    def extract_links(html)
      found = []
      Hpricot(html).search('a').each do |anchor|
        href = anchor['href']
        next if href.nil?

        begin
          absolute = to_absolute(URI(href))
        rescue StandardError
          next
        end
        found << absolute if in_domain?(absolute)
      end
      found.uniq
    end
  end
end