lib/anemone/page.rb in anemone-0.2.0 vs lib/anemone/page.rb in anemone-0.2.1

- old
+ new

@@ -1,16 +1,13 @@
-require 'anemone/http'
 require 'nokogiri'
 require 'ostruct'
 
 module Anemone
   class Page
 
     # The URL of the page
     attr_reader :url
-    # Array of distinct A tag HREFs from the page
-    attr_reader :links
     # Headers of the HTTP response
     attr_reader :headers
 
     # OpenStruct for user-stored data
     attr_accessor :data
@@ -25,78 +22,49 @@
     # Depth of this page from the root of the crawl. This is not necessarily the
     # shortest path; use PageHash#shortest_paths! to find that value.
     attr_accessor :depth
 
     # URL of the page that brought us to this page
     attr_accessor :referer
+    # Response time of the request for this page in milliseconds
+    attr_accessor :response_time
 
     #
-    # Create a new Page from the response of an HTTP request to *url*
-    #
-    def self.fetch(url, from_page = nil)
-      begin
-        url = URI(url) unless url.is_a?(URI)
-
-        if from_page
-          referer = from_page.url
-          depth = from_page.depth + 1
-        end
-
-        response, code, location = Anemone::HTTP.get(url, referer)
-
-        aka = nil
-        if !url.eql?(location)
-          aka = location
-        end
-
-        return Page.new(url, response.body, code, response.to_hash, aka, referer, depth)
-      rescue
-        return Page.new(url)
-      end
-    end
-
-    #
     # Create a new page
     #
-    def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0)
+    def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
       @url = url
       @code = code
       @headers = headers
-      @links = []
-      @aliases = []
+      @headers['content-type'] ||= ['']
+      @aliases = Array(aka)
       @data = OpenStruct.new
       @referer = referer
       @depth = depth || 0
+      @response_time = response_time
+      @doc = Nokogiri::HTML(body) if body && html? rescue nil
+    end
 
-      @aliases << aka if !aka.nil?
-
-      if body
-        begin
-          @doc = Nokogiri::HTML(body)
-        rescue
-          return
-        end
-
-        return if @doc.nil?
-
-        #get a list of distinct links on the page, in absolute url form
-        @doc.css('a').each do |a|
-          u = a.attributes['href'].content if a.attributes['href']
-          next if u.nil?
-
-          begin
-            abs = to_absolute(URI(u))
-          rescue
-            next
-          end
-
-          @links << abs if in_domain?(abs)
-        end
-
-        @links.uniq!
+    # Array of distinct A tag HREFs from the page
+    def links
+      return @links unless @links.nil?
+      @links = []
+      return @links if !doc
+
+      doc.css('a').each do |a|
+        u = a.attributes['href'].content rescue nil
+        next if u.nil? or u.empty?
+        abs = to_absolute(URI(u)) rescue next
+        @links << abs if in_domain?(abs)
       end
+      @links.uniq!
+      @links
     end
+    def discard_doc!
+      links # force parsing of page links before we trash the document
+      @doc = nil
+    end
 
     #
     # Return a new page with the same *response* and *url*, but
     # with a 200 response code
     #
@@ -122,27 +90,27 @@
     # redirect-aliases of those pages, as String objects.
     #
     # *page_hash* is a PageHash object with the results of the current crawl.
     #
     def links_and_their_aliases(page_hash)
-      @links.inject([]) do |results, link|
+      links.inject([]) do |results, link|
         results.concat([link].concat(page_hash[link].aliases))
       end
     end
 
     #
     # The content-type returned by the HTTP request for this page
     #
     def content_type
-      @headers['content-type'][0] rescue nil
+      headers['content-type'].first
    end
 
     #
     # Returns +true+ if the page is a HTML document, returns +false+
     # otherwise.
     #
     def html?
-      (@content_type =~ /text\/html/) == 0
+      !!(content_type =~ %r{^(text/html|application/xhtml+xml)\b})
     end
 
     #
     # Returns +true+ if the page is a HTTP redirect, returns +false+
     # otherwise.
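The main behavioral change above is that link extraction has moved out of the
constructor: links now parses the document lazily on its first call and
memoizes the result, and discard_doc! lets a crawler force that parse and then
drop the Nokogiri document to reclaim memory. A minimal usage sketch (the URL,
body, and headers below are made-up values; to_absolute and in_domain? are
defined elsewhere in this class, outside the hunks shown):

    require 'anemone/page'
    require 'uri'

    url  = URI('http://example.com/')
    body = '<html><body><a href="/about">About</a></body></html>'

    # This version indexes into the headers hash in the constructor
    # (@headers['content-type'] ||= ['']), so pass a Net::HTTP-style
    # hash of arrays rather than nil.
    page = Anemone::Page.new(url, body, 200, { 'content-type' => ['text/html'] })

    page.links         # first call parses the document and memoizes the array
    page.discard_doc!  # forces link parsing, then frees the Nokogiri document
    page.links         # served from the memoized array; no re-parse needed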
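Whether the body is parsed at all now hinges on html?, which the constructor
consults before invoking Nokogiri. The new predicate can be checked in plain
Ruby with no Anemone required. One caveat visible in the regexp itself: the +
in application/xhtml+xml is unescaped, so it quantifies the preceding l rather
than matching a literal plus sign:

    regexp = %r{^(text/html|application/xhtml+xml)\b}

    'text/html; charset=utf-8' =~ regexp  # => 0   (matches)
    'application/xhtml+xml'    =~ regexp  # => nil (the literal '+' never matches)
    'application/xhtmlxml'     =~ regexp  # => 0   ('l+' followed by 'xml')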
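The new response_time attribute is only stored by Page; measuring it is left
to whatever performs the HTTP request, now that self.fetch and the require of
anemone/http are gone from this file. A hypothetical caller might populate it
like this; the timing code is illustrative, not part of the gem:

    require 'net/http'
    require 'uri'

    uri   = URI('http://example.com/')
    start = Time.now
    response   = Net::HTTP.get_response(uri)
    elapsed_ms = ((Time.now - start) * 1000).to_i  # "in milliseconds", per the attribute's comment

    # Net::HTTPResponse#to_hash yields the lowercase-keyed hash of arrays
    # that the constructor expects (the removed self.fetch used it too).
    page = Anemone::Page.new(uri, response.body, response.code.to_i,
                             response.to_hash, nil, nil, 0, elapsed_ms)
    page.response_time  # => elapsed_ms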