lib/anemone/page.rb in anemone-0.2.0 vs lib/anemone/page.rb in anemone-0.2.1
- old
+ new
@@ -1,16 +1,13 @@
-require 'anemone/http'
require 'nokogiri'
require 'ostruct'
module Anemone
class Page
# The URL of the page
attr_reader :url
- # Array of distinct A tag HREFs from the page
- attr_reader :links
# Headers of the HTTP response
attr_reader :headers
# OpenStruct for user-stored data
attr_accessor :data
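This first hunk is the visible edge of the release's main change: `require 'anemone/http'` disappears because Page stops fetching itself (the `self.fetch` method is removed in the next hunk), and `attr_reader :links` disappears because link extraction becomes a lazy, memoized method. A minimal sketch of the new contract, assuming the `to_absolute` and `in_domain?` helpers defined elsewhere in this file behave as their names suggest:

    require 'anemone/page'

    page = Anemone::Page.new(URI('http://example.com/'),
                             '<a href="/a">a</a><a href="/a">a</a>',
                             200, { 'content-type' => ['text/html'] })
    page.links # first call parses, dedupes, memoizes => one URI for http://example.com/a
    page.links # later calls return the cached array without touching Nokogiri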
@@ -25,78 +22,49 @@
# Depth of this page from the root of the crawl. This is not necessarily the
# shortest path; use PageHash#shortest_paths! to find that value.
attr_accessor :depth
# URL of the page that brought us to this page
attr_accessor :referer
+ # Response time of the request for this page in milliseconds
+ attr_accessor :response_time
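`response_time` is only storage here; the value has to be measured by whatever performs the request, presumably the `Anemone::HTTP` code that replaces the removed `self.fetch` below. A hedged sketch of how a caller could populate it, using only stdlib calls rather than anemone's actual HTTP wiring:

    require 'net/http'
    require 'anemone/page'

    url = URI('http://example.com/')
    start = Time.now
    response = Net::HTTP.get_response(url)
    elapsed_ms = ((Time.now - start) * 1000).to_i
    page = Anemone::Page.new(url, response.body, response.code.to_i,
                             response.to_hash, nil, nil, 0, elapsed_ms)
    page.response_time # => elapsed_ms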
#
- # Create a new Page from the response of an HTTP request to *url*
- #
- def self.fetch(url, from_page = nil)
- begin
- url = URI(url) unless url.is_a?(URI)
-
- if from_page
- referer = from_page.url
- depth = from_page.depth + 1
- end
-
- response, code, location = Anemone::HTTP.get(url, referer)
-
- aka = nil
- if !url.eql?(location)
- aka = location
- end
-
- return Page.new(url, response.body, code, response.to_hash, aka, referer, depth)
- rescue
- return Page.new(url)
- end
- end
-
- #
# Create a new page
#
- def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0)
+ def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
@url = url
@code = code
@headers = headers
- @links = []
- @aliases = []
+ @headers['content-type'] ||= ['']
+ @aliases = Array(aka)
@data = OpenStruct.new
@referer = referer
@depth = depth || 0
+ @response_time = response_time
+ @doc = Nokogiri::HTML(body) if body && html? rescue nil
+ end
- @aliases << aka if !aka.nil?
-
- if body
- begin
- @doc = Nokogiri::HTML(body)
- rescue
- return
- end
-
- return if @doc.nil?
-
- # get a list of distinct links on the page, in absolute URL form
- @doc.css('a').each do |a|
- u = a.attributes['href'].content if a.attributes['href']
- next if u.nil?
-
- begin
- abs = to_absolute(URI(u))
- rescue
- next
- end
-
- @links << abs if in_domain?(abs)
- end
-
- @links.uniq!
+ # Array of distinct A tag HREFs from the page
+ def links
+ return @links unless @links.nil?
+ @links = []
+ return @links if !doc
+
+ doc.css('a').each do |a|
+ u = a.attributes['href'].content rescue nil
+ next if u.nil? or u.empty?
+ abs = to_absolute(URI(u)) rescue next
+ @links << abs if in_domain?(abs)
end
+ @links.uniq!
+ @links
end
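Two details in the new `links` body are easy to misread. The statement-modifier `rescue`s silently skip anchors whose href is missing or fails to parse as a URI, and the method cannot simply end with `@links.uniq!` because `Array#uniq!` returns nil when there was nothing to remove; hence the explicit `@links` on the final line:

    [1, 2].uniq! # => nil   (no duplicates, array untouched)
    [1, 1].uniq! # => [1]   (mutated in place)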
+ def discard_doc!
+ links # force parsing of page links before we trash the document
+ @doc = nil
+ end
#
# Return a new page with the same *response* and *url*, but
# with a 200 response code
#
@@ -122,27 +90,27 @@
# redirect-aliases of those pages, as String objects.
#
# *page_hash* is a PageHash object with the results of the current crawl.
#
def links_and_their_aliases(page_hash)
- @links.inject([]) do |results, link|
+ links.inject([]) do |results, link|
results.concat([link].concat(page_hash[link].aliases))
end
end
#
# The content-type returned by the HTTP request for this page
#
def content_type
- @headers['content-type'][0] rescue nil
+ headers['content-type'].first
end
#
# Returns +true+ if the page is an HTML document, returns +false+
# otherwise.
#
def html?
- (@content_type =~ /text\/html/) == 0
+ !!(content_type =~ %r{^(text/html|application/xhtml+xml)\b})
end
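`content_type` can drop its `rescue nil` because the constructor now guarantees a `content-type` entry, and `html?` now goes through that reader instead of `@content_type`, an instance variable nothing in this diff assigns. One caveat in the new pattern: inside `%r{}` an unescaped `+` quantifies the preceding `l`, so a literal XHTML media type never matches as written; escaping it would:

    'application/xhtml+xml' =~ %r{^(text/html|application/xhtml+xml)\b}  # => nil
    'application/xhtml+xml' =~ %r{^(text/html|application/xhtml\+xml)\b} # => 0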
#
# Returns +true+ if the page is an HTTP redirect, returns +false+
# otherwise.