lib/anemone/page.rb in anemone-0.0.2 vs lib/anemone/page.rb in anemone-0.0.3
- old
+ new
@@ -1,159 +1,184 @@
-require 'anemone/http'
-require 'hpricot'
-
-module Anemone
- class Page
- # The URL of the page
- attr_reader :url
- # Array of distinct A tag HREFs from the page
- attr_reader :links
- #Body of the HTTP response
- attr_reader :body
- #Content-type of the HTTP response
- attr_reader :content_type
-
- # Integer response code of the page
- attr_accessor :code
- # Array of redirect-aliases for the page
- attr_accessor :aliases
- # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
- attr_accessor :visited
- # Used by PageHash#shortest_paths! to store depth of the page
- attr_accessor :depth
-
- #
- # Create a new Page from the response of an HTTP request to *url*
- #
- def self.fetch(url)
- begin
- url = URI(url) if url.is_a?(String)
-
- response, code, location = Anemone::HTTP.get(url)
-
- aka = nil
- if !url.eql?(location)
- aka = location
- end
-
- return Page.new(url, response.body, code, response['Content-Type'], aka)
- rescue
- return Page.new(url)
- end
- end
-
- #
- # Create a new page
- #
- def initialize(url, body = nil, code = nil, content_type = nil, aka = nil)
- @url = url
- @body = body unless Anemone.options.discard_page_bodies
- @code = code
- @content_type = content_type
- @links = []
- @aliases = []
-
- @aliases << aka if !aka.nil?
-
- #get a list of distinct links on the page, in absolute url form
- if body
- Hpricot(body).search('a').each do |a|
- u = a['href']
- next if u.nil?
-
- begin
- u = URI(u)
- rescue
- next
- end
-
- abs = to_absolute(u)
- @links << abs if in_domain?(abs)
- end
-
- @links.uniq!
- end
- end
-
-
- #
- # Return a new page with the same *response* and *url*, but
- # with a 200 response code
- #
- def alias_clone(url)
- p = clone
- p.add_alias!(@aka) if !@aka.nil?
- p.code = 200
- p
- end
-
- #
- # Add a redirect-alias String *aka* to the list of the page's aliases
- #
- # Returns *self*
- #
- def add_alias!(aka)
- @aliases << aka if !@aliases.include?(aka)
- self
- end
-
- #
- # Returns an Array of all links from this page, and all the
- # redirect-aliases of those pages, as String objects.
- #
- # *page_hash* is a PageHash object with the results of the current crawl.
- #
- def links_and_their_aliases(page_hash)
- @links.inject([]) do |results, link|
- results.concat([link].concat(page_hash[link].aliases))
- end
- end
-
- #
- # Returns +true+ if the page is a HTML document, returns +false+
- # otherwise.
- #
- def html?
- (@content_type =~ /text\/html/) == 0
- end
-
- #
- # Returns +true+ if the page is a HTTP redirect, returns +false+
- # otherwise.
- #
- def redirect?
- (300..399).include?(@code)
- end
-
- #
- # Returns +true+ if the page was not found (returned 404 code),
- # returns +false+ otherwise.
- #
- def not_found?
- 404 == @code
- end
-
- #
- # Converts relative URL *link* into an absolute URL based on the
- # location of the page
- #
- def to_absolute(link)
- # remove anchor
- link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
-
- relative = URI(link)
- absolute = @url.merge(relative)
-
- absolute.path = '/' if absolute.path.empty?
-
- return absolute
- end
-
- #
- # Returns +true+ if *uri* is in the same domain as the page, returns
- # +false+ otherwise
- #
- def in_domain?(uri)
- uri.host == @url.host
- end
- end
-end
\ No newline at end of file
+require 'anemone/http'
+require 'hpricot'
+
module Anemone
  #
  # A single crawled document: the HTTP response data for one URL plus the
  # metadata (title, headings, meta-description, in-domain links) that is
  # extracted from the body when one is supplied.
  #
  class Page
    # The URL of the page
    attr_reader :url
    # Array of distinct A tag HREFs from the page, as absolute URI objects
    attr_reader :links
    # Body of the HTTP response (nil when bodies are being discarded)
    attr_reader :body
    # Content-type of the HTTP response
    attr_reader :content_type
    # Title of the page, if a <title> element is present
    attr_reader :title
    # First h1 on the page, if present
    attr_reader :h1
    # First h2 on the page, if present
    attr_reader :h2
    # Meta-description of the page, if present
    attr_reader :description

    # Integer response code of the page
    attr_accessor :code
    # Array of redirect-aliases for the page
    attr_accessor :aliases
    # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
    attr_accessor :visited
    # Used by PageHash#shortest_paths! to store depth of the page
    attr_accessor :depth

    # Characters that #to_absolute percent-encodes. This is the complement of
    # the RFC 2396 reserved + unreserved sets (what the legacy URI.encode
    # escaped) with "%" additionally treated as safe, so escape sequences that
    # are already present in a link are not encoded a second time.
    UNSAFE_URI_CHARS = %r{[^\-_.!~*'()a-zA-Z\d;/?:@&=+$,\[\]%]}

    #
    # Create a new Page from the response of an HTTP request to *url*
    #
    # *url* may be a String or a URI. Anemone::HTTP.get is expected to yield
    # the response, the response code and the final location; when that
    # location differs from *url* it is recorded as a redirect-alias.
    #
    # Any StandardError raised along the way is swallowed and a Page that
    # carries only the URL (no body, code or content type) is returned.
    #
    def self.fetch(url)
      url = URI(url) if url.is_a?(String)

      response, code, location = Anemone::HTTP.get(url)
      aka = url.eql?(location) ? nil : location

      Page.new(url, response.body, code, response['Content-Type'], aka)
    rescue
      Page.new(url)
    end

    #
    # Create a new page
    #
    # *url* is the URI of the page, *body* the response body, *code* the
    # response code, *content_type* the Content-Type header value and *aka*
    # an optional redirect-alias to register straight away.
    #
    # When a *body* is given it is parsed with Hpricot to fill in the title,
    # headings, meta-description and the list of in-domain links. The body
    # itself is only retained when Anemone.options.discard_page_bodies is
    # not set.
    #
    def initialize(url, body = nil, code = nil, content_type = nil, aka = nil)
      @url = url
      @body = body unless Anemone.options.discard_page_bodies
      @code = code
      @content_type = content_type
      @links = []
      @aliases = []

      @aliases << aka unless aka.nil?

      parse_document(body) if body
    end

    #
    # Return a new page with the same *response* and *url*, but
    # with a 200 response code
    #
    # The *url* argument is accepted for the benefit of existing callers but
    # is not used. The copy is a shallow clone, so it shares its aliases
    # array with the page it was cloned from.
    #
    def alias_clone(url)
      copy = clone
      copy.code = 200
      copy
    end

    #
    # Add a redirect-alias *aka* to the list of the page's aliases,
    # unless it is already listed
    #
    # Returns *self*
    #
    def add_alias!(aka)
      @aliases << aka unless @aliases.include?(aka)
      self
    end

    #
    # Returns an Array of all links from this page, each followed by the
    # redirect-aliases of the page it points to.
    #
    # *page_hash* is a PageHash object with the results of the current crawl.
    # A link that has no entry in *page_hash* contributes only itself.
    #
    def links_and_their_aliases(page_hash)
      @links.inject([]) do |results, link|
        results << link
        linked_page = page_hash[link]
        results.concat(linked_page.aliases) unless linked_page.nil?
        results
      end
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    # The content type has to start with "text/html"; the comparison ignores
    # case because media types are case-insensitive.
    #
    def html?
      (@content_type =~ %r{text/html}i) == 0
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300..399).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    # *link* may be a String or a URI. Its fragment is dropped, characters
    # that are not legal in a URI are percent-encoded, and an empty path is
    # normalised to "/". Returns a URI; raises URI::InvalidURIError when the
    # result still cannot be parsed.
    #
    def to_absolute(link)
      # Everything from the first "#" on is the fragment, whatever characters
      # it contains.
      without_fragment = link.to_s.sub(/#.*\z/m, '')

      # Hand-rolled escaping: URI.encode no longer exists in current Rubies.
      escaped = without_fragment.gsub(UNSAFE_URI_CHARS) do |char|
        char.bytes.map { |byte| format('%%%02X', byte) }.join
      end

      absolute = @url.merge(URI(escaped))
      absolute.path = '/' if absolute.path.empty?
      absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      uri.host == @url.host
    end

    private

    # Parses *body* with Hpricot and stores the title, first h1, first h2,
    # meta-description and in-domain links found in it.
    def parse_document(body)
      doc = Hpricot(body)

      @title = first_inner_html(doc, 'title')
      @h1 = first_inner_html(doc, 'h1')
      @h2 = first_inner_html(doc, 'h2')

      description_elem = doc.at('meta[@name=description]')
      @description = description_elem['content'] unless description_elem.nil?

      collect_links(doc)
    end

    # Inner HTML of the first element matching *selector*, or nil if none.
    def first_inner_html(doc, selector)
      elem = doc.at(selector)
      elem.inner_html unless elem.nil?
    end

    # Fills @links with the distinct, absolute, in-domain targets of the
    # A tags in *doc*. HREFs that cannot be turned into a URI are skipped.
    def collect_links(doc)
      doc.search('a').each do |anchor|
        href = anchor['href']
        next if href.nil?

        begin
          absolute = to_absolute(URI(href))
        rescue
          next
        end

        @links << absolute if in_domain?(absolute)
      end

      @links.uniq!
    end
  end
end