lib/anemone/page.rb in anemone-0.2.3 vs lib/anemone/page.rb in anemone-0.3.0

- old
+ new

module Anemone
  class Page

    # The URL of the page
    attr_reader :url
    # Headers of the HTTP response
    attr_reader :headers
    # URL of the page this one redirected to, if any
    attr_reader :redirect_to
    # Exception object, if one was raised during HTTP#fetch_page
    attr_reader :error

    # OpenStruct for user-stored data
    attr_accessor :data
    # Integer response code of the page
    attr_accessor :code
    # Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
    attr_accessor :visited
    # Depth of this page from the root of the crawl. This is not necessarily the
    # shortest path; use PageStore#shortest_paths! to find that value.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
    # Response time of the request for this page in milliseconds
    attr_accessor :response_time

    #
    # Create a new page.
    #
    # url    - URI of the page (must respond to #merge and #host)
    # params - Hash of optional attributes: :code, :headers, :aka,
    #          :referer, :depth, :redirect_to, :response_time, :body, :error
    #
    def initialize(url, params = {})
      @url = url
      @data = OpenStruct.new

      @code = params[:code]
      @headers = params[:headers] || {}
      @headers['content-type'] ||= ['']
      @aliases = Array(params[:aka]).compact
      @referer = params[:referer]
      @depth = params[:depth] || 0
      @redirect_to = to_absolute(params[:redirect_to])
      @response_time = params[:response_time]
      @body = params[:body]
      @error = params[:error]

      # a page counts as fetched once an HTTP response code is known
      @fetched = !params[:code].nil?
    end

    # Array of distinct A tag HREFs from the page
    def links
      return @links unless @links.nil?
      @links = []
      return @links unless doc

      doc.css('a').each do |a|
        u = a.attributes['href'].content rescue nil
        next if u.nil? || u.empty?
        begin
          abs = to_absolute(URI(u))
        rescue StandardError
          next # skip unparseable URLs
        end
        @links << abs if in_domain?(abs)
      end
      @links.uniq!
      @links
    end

    # Nokogiri document for the HTML body (parsed lazily; nil for
    # non-HTML pages or when parsing fails)
    def doc
      return @doc if @doc
      begin
        @doc = Nokogiri::HTML(@body) if @body && html?
      rescue StandardError
        nil
      end
    end

    # Delete the Nokogiri document and response body to conserve memory
    def discard_doc!
      links # force parsing of page links before we trash the document
      @doc = @body = nil
    end

    # Returns +true+ if the page was actually retrieved over HTTP
    # (i.e. a response code was recorded), +false+ otherwise.
    def fetched?
      @fetched
    end

    #
    # The content-type returned by the HTTP request for this page
    #
    def content_type
      headers['content-type'].first
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      # NOTE: the '+' must be escaped, otherwise the regexp treats it as
      # a quantifier and 'application/xhtml+xml' never matches
      !!(content_type =~ %r{^(text/html|application/xhtml\+xml)\b})
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300..399).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page. Returns +nil+ when *link* is +nil+.
    #
    def to_absolute(link)
      return nil if link.nil?

      # remove anchor; URI::DEFAULT_PARSER.escape replaces URI.encode,
      # which was removed in Ruby 3.0
      link = URI::DEFAULT_PARSER.escape(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, ''))

      relative = URI(link)
      absolute = @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      uri.host == @url.host
    end

    # Custom marshalling keeps the page serializable; the lazily-built
    # Nokogiri document is deliberately excluded.
    def marshal_dump
      [@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
    end

    def marshal_load(ary)
      @url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
    end

  end
end