lib/anemone/page.rb in anemone-0.2.3 vs lib/anemone/page.rb in anemone-0.3.0

- old
+ new

module Anemone
  class Page

    # The URL of the page
    attr_reader :url
    # Headers of the HTTP response
    attr_reader :headers
    # URL of the page this one redirected to, if any
    attr_reader :redirect_to
    # Exception object, if one was raised during HTTP#fetch_page
    attr_reader :error

    # OpenStruct for user-stored data
    attr_accessor :data
    # Integer response code of the page
    attr_accessor :code
    # Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
    attr_accessor :visited
    # Depth of this page from the root of the crawl. This is not necessarily the
    # shortest path; use PageStore#shortest_paths! to find that value.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
    # Response time of the request for this page in milliseconds
    attr_accessor :response_time

    #
    # Create a new page.
    #
    # url    - URI of the page (must respond to #merge and #host)
    # params - Hash of optional attributes: :code, :headers, :aka,
    #          :referer, :depth, :redirect_to, :response_time, :body, :error
    #
    def initialize(url, params = {})
      @url = url
      @data = OpenStruct.new

      @code = params[:code]
      @headers = params[:headers] || {}
      @headers['content-type'] ||= ['']
      @aliases = Array(params[:aka]).compact
      @referer = params[:referer]
      @depth = params[:depth] || 0
      @redirect_to = to_absolute(params[:redirect_to])
      @response_time = params[:response_time]
      @body = params[:body]
      @error = params[:error]

      # a page counts as fetched once an HTTP response code is known
      @fetched = !params[:code].nil?
    end

    # Array of distinct A tag HREFs from the page
    def links
      return @links unless @links.nil?
      @links = []
      return @links unless doc

      doc.css('a').each do |a|
        u = a.attributes['href'].content rescue nil
        next if u.nil? || u.empty?
        begin
          abs = to_absolute(URI(u))
        rescue StandardError
          next # skip unparseable URLs
        end
        @links << abs if in_domain?(abs)
      end
      @links.uniq!
      @links
    end

    # Nokogiri document for the HTML body (parsed lazily; nil for
    # non-HTML pages or when parsing fails)
    def doc
      return @doc if @doc
      begin
        @doc = Nokogiri::HTML(@body) if @body && html?
      rescue StandardError
        nil
      end
    end

    # Delete the Nokogiri document and response body to conserve memory
    def discard_doc!
      links # force parsing of page links before we trash the document
      @doc = @body = nil
    end

    # Returns +true+ if the page was actually retrieved over HTTP
    # (i.e. a response code was recorded), +false+ otherwise.
    def fetched?
      @fetched
    end

    #
    # The content-type returned by the HTTP request for this page
    #
    def content_type
      headers['content-type'].first
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      # NOTE: the '+' must be escaped, otherwise the regexp treats it as
      # a quantifier and 'application/xhtml+xml' never matches
      !!(content_type =~ %r{^(text/html|application/xhtml\+xml)\b})
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300..399).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page. Returns +nil+ when *link* is +nil+.
    #
    def to_absolute(link)
      return nil if link.nil?

      # remove anchor; URI::DEFAULT_PARSER.escape replaces URI.encode,
      # which was removed in Ruby 3.0
      link = URI::DEFAULT_PARSER.escape(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, ''))

      relative = URI(link)
      absolute = @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      uri.host == @url.host
    end

    # Custom marshalling keeps the page serializable; the lazily-built
    # Nokogiri document is deliberately excluded.
    def marshal_dump
      [@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
    end

    def marshal_load(ary)
      @url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
    end

  end
end