lib/anemone/page.rb in anemone-0.4.0 vs lib/anemone/page.rb in anemone-0.5.0

- old
+ new

@@ -57,12 +57,12 @@
     def links
       return @links unless @links.nil?
       @links = []
       return @links if !doc
 
-      doc.css('a').each do |a|
-        u = a.attributes['href'].content rescue nil
+      doc.search("//a[@href]").each do |a|
+        u = a['href']
         next if u.nil? or u.empty?
         abs = to_absolute(URI(u)) rescue next
         @links << abs if in_domain?(abs)
       end
       @links.uniq!
@@ -118,11 +118,11 @@
     #
     # Returns +true+ if the page is a HTTP redirect, returns +false+
     # otherwise.
     #
     def redirect?
-      (300..399).include?(@code)
+      (300..307).include?(@code)
     end
 
     #
     # Returns +true+ if the page was not found (returned 404 code),
     # returns +false+ otherwise.
@@ -163,7 +163,40 @@
 
     def marshal_load(ary)
       @url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
     end
 
+    def to_hash
+      {'url' => @url.to_s,
+       'headers' => Marshal.dump(@headers),
+       'data' => Marshal.dump(@data),
+       'body' => @body,
+       'links' => links.map(&:to_s),
+       'code' => @code,
+       'visited' => @visited,
+       'depth' => @depth,
+       'referer' => @referer.to_s,
+       'redirect_to' => @redirect_to.to_s,
+       'response_time' => @response_time,
+       'fetched' => @fetched}
+    end
+
+    def self.from_hash(hash)
+      page = self.new(URI(hash['url']))
+      {'@headers' => Marshal.load(hash['headers']),
+       '@data' => Marshal.load(hash['data']),
+       '@body' => hash['body'],
+       '@links' => hash['links'].map { |link| URI(link) },
+       '@code' => hash['code'].to_i,
+       '@visited' => hash['visited'],
+       '@depth' => hash['depth'].to_i,
+       '@referer' => hash['referer'],
+       '@redirect_to' => URI(hash['redirect_to']),
+       '@response_time' => hash['response_time'].to_i,
+       '@fetched' => hash['fetched']
+      }.each do |var, value|
+        page.instance_variable_set(var, value)
+      end
+      page
+    end
   end
 end