lib/anemone/page.rb in anemone-0.0.2 vs lib/anemone/page.rb in anemone-0.0.3

- old
+ new

require 'anemone/http'
require 'hpricot'

module Anemone
  #
  # A single fetched page: holds the HTTP response data, the distinct
  # in-domain links found on the page, and (for HTML documents) the
  # title, first h1/h2, and meta-description.
  #
  class Page
    # The URL of the page
    attr_reader :url
    # Array of distinct A tag HREFs from the page
    attr_reader :links
    # Body of the HTTP response
    attr_reader :body
    # Content-type of the HTTP response
    attr_reader :content_type
    # Title of the page if it is an HTML document
    attr_reader :title
    # First h1 on the page, if present
    attr_reader :h1
    # First h2 on the page, if present
    attr_reader :h2
    # Meta-description of the page, if present
    attr_reader :description

    # Integer response code of the page
    attr_accessor :code
    # Array of redirect-aliases for the page
    attr_accessor :aliases
    # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
    attr_accessor :visited
    # Used by PageHash#shortest_paths! to store depth of the page
    attr_accessor :depth

    #
    # Create a new Page from the response of an HTTP request to *url*.
    # On any error (network failure, bad URI, ...) a bare Page with only
    # the url set is returned instead of raising.
    #
    def self.fetch(url)
      begin
        url = URI(url) if url.is_a?(String)

        response, code, location = Anemone::HTTP.get(url)

        # if the request was redirected, the final location is a
        # redirect-alias for the requested url
        aka = nil
        if !url.eql?(location)
          aka = location
        end

        return Page.new(url, response.body, code, response['Content-Type'], aka)
      rescue
        return Page.new(url)
      end
    end

    #
    # Create a new page.
    #
    # *url* is the page's URI; *body* the raw response body (may be nil);
    # *code* the integer HTTP status; *content_type* the Content-Type
    # header; *aka* an optional redirect-alias URI.
    #
    def initialize(url, body = nil, code = nil, content_type = nil, aka = nil)
      @url = url
      @body = body unless Anemone.options.discard_page_bodies
      @code = code
      @content_type = content_type
      @links = []
      @aliases = []

      # BUGFIX: remember the redirect-alias so alias_clone can re-apply it.
      # Previously @aka was never assigned anywhere, so the
      # "p.add_alias!(@aka) if !@aka.nil?" check in alias_clone was dead code
      # and aliases were silently dropped on clone.
      @aka = aka
      @aliases << aka if !aka.nil?

      if body
        h = Hpricot(body)

        # save page title
        title_elem = h.at('title')
        @title = title_elem.inner_html if !title_elem.nil?

        # save first h1 on the page
        h1_elem = h.at('h1')
        @h1 = h1_elem.inner_html if !h1_elem.nil?

        # save first h2 on the page
        h2_elem = h.at('h2')
        @h2 = h2_elem.inner_html if !h2_elem.nil?

        # save page meta-description
        description_elem = h.at('meta[@name=description]')
        @description = description_elem['content'] if !description_elem.nil?

        # get a list of distinct links on the page, in absolute url form
        h.search('a').each do |a|
          u = a['href']
          next if u.nil?

          begin
            abs = to_absolute(URI(u))
          rescue
            # skip hrefs that are not parseable/resolvable URIs
            next
          end

          @links << abs if in_domain?(abs)
        end

        @links.uniq!
      end
    end

    #
    # Return a new page with the same *response* and *url*, but
    # with a 200 response code
    #
    def alias_clone(url)
      p = clone
      p.add_alias!(@aka) if !@aka.nil?
      p.code = 200
      p
    end

    #
    # Add a redirect-alias String *aka* to the list of the page's aliases
    #
    # Returns *self*
    #
    def add_alias!(aka)
      @aliases << aka if !@aliases.include?(aka)
      self
    end

    #
    # Returns an Array of all links from this page, and all the
    # redirect-aliases of those pages, as String objects.
    #
    # *page_hash* is a PageHash object with the results of the current crawl.
    #
    def links_and_their_aliases(page_hash)
      @links.inject([]) do |results, link|
        results.concat([link].concat(page_hash[link].aliases))
      end
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      # nil-safe: a nil @content_type makes =~ return nil, which != 0
      (@content_type =~ /text\/html/) == 0
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300..399).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    def to_absolute(link)
      # remove anchor before resolving, so "#frag"-only links resolve
      # to the page itself
      link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, ''))

      relative = URI(link)
      absolute = @url.merge(relative)

      # normalize "http://host" to "http://host/" so links compare equal
      absolute.path = '/' if absolute.path.empty?

      return absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      uri.host == @url.host
    end
  end
end