lib/anemone/page.rb in anemone-0.4.0 vs lib/anemone/page.rb in anemone-0.5.0
- old
+ new
@@ -57,12 +57,12 @@
def links
return @links unless @links.nil?
@links = []
return @links if !doc
- doc.css('a').each do |a|
- u = a.attributes['href'].content rescue nil
+ doc.search("//a[@href]").each do |a|
+ u = a['href']
next if u.nil? or u.empty?
abs = to_absolute(URI(u)) rescue next
@links << abs if in_domain?(abs)
end
@links.uniq!
@@ -118,11 +118,11 @@
#
# Returns +true+ if the page is an HTTP redirect, returns +false+
# otherwise.
#
def redirect?
- (300..399).include?(@code)
+ (300..307).include?(@code)
end
#
# Returns +true+ if the page was not found (returned 404 code),
# returns +false+ otherwise.
@@ -163,7 +163,40 @@
#
# Restores this object's state from +ary+ by positional assignment.
# Elements are consumed in this fixed order: url, headers, data, body,
# links, code, visited, depth, referer, redirect_to, response_time,
# fetched. Missing trailing elements leave the matching variable nil.
# Presumably the counterpart of a marshal_dump defined elsewhere in
# this file -- the order must match whatever that method emits.
#
def marshal_load(ary)
  @url, @headers, @data, @body,
    @links, @code, @visited, @depth,
    @referer, @redirect_to, @response_time, @fetched = ary
end
#
# Builds a Hash with String keys describing this page.
#
# The url, referer and redirect_to values are converted with +to_s+
# (so a nil value is stored as an empty string), headers and data are
# stored as Marshal dumps, and links are stored as an array of strings.
# Calling +links+ here may populate @links if it has not been computed.
# The remaining values are copied as they are.
#
def to_hash
  page_hash = {}
  page_hash['url'] = @url.to_s
  page_hash['headers'] = Marshal.dump(@headers)
  page_hash['data'] = Marshal.dump(@data)
  page_hash['body'] = @body
  page_hash['links'] = links.map { |link| link.to_s }
  page_hash['code'] = @code
  page_hash['visited'] = @visited
  page_hash['depth'] = @depth
  page_hash['referer'] = @referer.to_s
  page_hash['redirect_to'] = @redirect_to.to_s
  page_hash['response_time'] = @response_time
  page_hash['fetched'] = @fetched
  page_hash
end
+
#
# Rebuilds a page from a Hash in the format produced by +to_hash+.
#
# +hash+ must use String keys. 'url' is parsed into a URI and handed to
# +new+; every other field is written straight into the matching
# instance variable. 'code', 'depth' and 'response_time' are coerced
# with +to_i+, 'links' entries are parsed into URIs, and 'headers' and
# 'data' are restored with Marshal.load.
#
# 'redirect_to' becomes nil when it is nil or empty. +to_hash+ writes a
# missing redirect as "", and URI("") would otherwise yield an empty but
# truthy URI object, making the page look like it had been redirected.
#
# 'referer' is kept exactly as stored (a String when it comes from
# +to_hash+), so an absent referer reads back as "" rather than nil.
#
# NOTE(review): Marshal.load can instantiate arbitrary objects. Only
# feed this method hashes that were written by +to_hash+ into a store
# you trust; never call it on data from an untrusted source.
#
# Returns the reconstructed page.
#
def self.from_hash(hash)
  page = new(URI(hash['url']))

  stored_redirect = hash['redirect_to']
  redirect_to =
    if stored_redirect.nil? || stored_redirect.to_s.empty?
      nil
    else
      URI(stored_redirect)
    end

  {'@headers' => Marshal.load(hash['headers']),
   '@data' => Marshal.load(hash['data']),
   '@body' => hash['body'],
   '@links' => hash['links'].map { |link| URI(link) },
   '@code' => hash['code'].to_i,
   '@visited' => hash['visited'],
   '@depth' => hash['depth'].to_i,
   '@referer' => hash['referer'],
   '@redirect_to' => redirect_to,
   '@response_time' => hash['response_time'].to_i,
   '@fetched' => hash['fetched']
  }.each do |var, value|
    page.instance_variable_set(var, value)
  end
  page
end
end
end