lib/spidr/page.rb in spidr-0.1.6 vs lib/spidr/page.rb in spidr-0.1.7
- old
+ new
@@ -21,10 +21,11 @@
# _response_.
#
def initialize(url,response)
@url = url
@response = response
# Hash form of the response as produced by +to_hash+; +links+ looks up the
# 'location' entry in it. NOTE(review): presumably this maps header names
# to their values -- confirm the exact value shape (String vs. Array)
# against the response class actually passed in.
+ @headers = response.to_hash
# Cache slot for the parsed document; starts out +nil+ and is assigned
# by +doc+ the first time a document is successfully built.
@doc = nil
end
#
# Returns the response code from the page.
@@ -190,10 +191,12 @@
# <tt>text/xml</tt> content-type, a Nokogiri::XML::Document object
# will be returned. Other content-types will cause +nil+ to be
# returned.
#
def doc
+ return nil if (body.nil? || body.empty?)
+
begin
if html?
return @doc ||= Nokogiri::HTML(body)
elsif xml?
return @doc ||= Nokogiri::XML(body)
@@ -207,14 +210,29 @@
# Returns all links found in the page, as an Array.
#
# The following are collected, in this order:
#
# * the <tt>location</tt> header value(s), when the response code is
#   300..303 or 307;
# * the +href+ attribute of every <tt>a</tt> tag;
# * the +src+ attribute of every <tt>frame</tt> tag;
# * the +src+ attribute of every <tt>iframe</tt> tag.
#
# Tag attributes are only gathered when the page is HTML and a document
# could be obtained from +doc+. +nil+ and empty values are never added.
#
def links
  urls = []

  add_url = lambda { |url|
    urls << url unless (url.nil? || url.empty?)
  }

  case code
  when 300..303, 307
    # The headers Hash may hold either a single String or an Array of
    # Strings per header (Net::HTTP's +to_hash+ yields Arrays). Array()
    # normalizes String, Array and nil alike, so individual URLs are
    # appended rather than a nested Array.
    Array(@headers['location']).each { |location| add_url.call(location) }
  end

  # Evaluate +doc+ once and reuse the result for every selector below.
  if html? && (document = doc)
    # Selector / attribute pairs, listed in the order their results must
    # appear in the returned Array.
    [
      ['a[@href]', 'href'],
      ['frame[@src]', 'src'],
      ['iframe[@src]', 'src']
    ].each do |selector, attribute|
      document.search(selector).each do |node|
        add_url.call(node.get_attribute(attribute))
      end
    end
  end

  urls
end