lib/spidr/page.rb in spidr-0.1.6 vs lib/spidr/page.rb in spidr-0.1.7

- old
+ new

@@ -21,10 +21,11 @@
     # _response_.
     #
     def initialize(url,response)
       @url = url
       @response = response
+      @headers = response.to_hash
       @doc = nil
     end

     #
     # Returns the response code from the page.
@@ -190,10 +191,12 @@
     # <tt>text/xml</tt> content-type, a Nokogiri::XML::Document object
     # will be returned. Other content-types will cause +nil+ to be
     # returned.
     #
     def doc
+      return nil if (body.nil? || body.empty?)
+
       begin
         if html?
           return @doc ||= Nokogiri::HTML(body)
         elsif xml?
           return @doc ||= Nokogiri::XML(body)
@@ -207,14 +210,29 @@
     # Returns all links from the HTML page.
     #
     def links
       urls = []

-      if (html? && self.doc)
-        self.doc.search('a[@href]').each do |a|
-          url = a.get_attribute('href')
+      add_url = lambda { |url|
+        urls << url unless (url.nil? || url.empty?)
+      }
-          urls << url unless url.empty?
+      case code
+      when 300..303, 307
+        add_url.call(@headers['location'])
+      end
+
+      if (html? && doc)
+        doc.search('a[@href]').each do |a|
+          add_url.call(a.get_attribute('href'))
+        end
+
+        doc.search('frame[@src]').each do |iframe|
+          add_url.call(iframe.get_attribute('src'))
+        end
+
+        doc.search('iframe[@src]').each do |iframe|
+          add_url.call(iframe.get_attribute('src'))
         end
       end

       return urls
     end